questlab/www/analytics/core/Tracker/PageUrl.php
coderkun 046a724272 merge
2015-04-27 16:42:05 +02:00

328 lines
11 KiB
PHP

<?php
/**
* Piwik - Open source web analytics
*
* @link http://piwik.org
* @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
*
*/
namespace Piwik\Tracker;
use Piwik\Common;
use Piwik\Config;
use Piwik\UrlHelper;
/**
 * Static helpers used by the tracker to clean up, normalize and rebuild page URLs.
 *
 * The class covers: stripping excluded query parameters, handling URL fragments
 * according to the per-site setting, converting "matrix" URLs into regular query
 * strings, re-encoding query parameter values to UTF-8 and splitting/joining the
 * URL prefix (scheme + optional "www.") from the rest of the URL.
 */
class PageUrl
{
    /**
     * Map URL prefixes to integers.
     *
     * The "www." variants are listed before the bare schemes on purpose:
     * normalizeUrl() returns on the first matching prefix, so the longer
     * prefix has to be tried first.
     *
     * @see self::normalizeUrl(), self::reconstructNormalizedUrl()
     */
    public static $urlPrefixMap = array(
        'http://www.'  => 1,
        'http://'      => 0,
        'https://www.' => 3,
        'https://'     => 2
    );

    /**
     * Query parameter names that are always removed from tracked URLs,
     * in addition to the per-site and campaign parameters.
     *
     * @see self::getQueryParametersToExclude()
     */
    protected static $queryParametersToExclude = array('gclid', 'fb_xd_fragment', 'fb_comment_id',
                                                       'phpsessid', 'jsessionid', 'sessionid', 'aspsessionid',
                                                       'doing_wp_cron');

    /**
     * Given the input URL, removes all query parameters that are excluded for this site.
     *
     * The URL is first cleaned up (see cleanupUrl()) and its host / fragment are
     * normalized. If the URL has a query string the exclusion is applied to it;
     * otherwise, if it has a fragment, the exclusion is applied to the fragment.
     *
     * @static
     * @param string $originalUrl
     * @param int $idSite
     * @return bool|string The rebuilt URL, as returned by UrlHelper::getParseUrlReverse()
     */
    public static function excludeQueryParametersFromUrl($originalUrl, $idSite)
    {
        $originalUrl = self::cleanupUrl($originalUrl);

        // parse_url() warns on malformed URLs; the failure value is passed on as-is
        $parsedUrl = @parse_url($originalUrl);
        $parsedUrl = self::cleanupHostAndHashTag($parsedUrl, $idSite);
        $parametersToExclude = self::getQueryParametersToExclude($idSite);

        // The query string takes precedence; the hash tag is only filtered when
        // there is no query string at all.
        if (!empty($parsedUrl['query'])) {
            $part = 'query';
        } elseif (!empty($parsedUrl['fragment'])) {
            $part = 'fragment';
        } else {
            return UrlHelper::getParseUrlReverse($parsedUrl);
        }

        $queryParameters = UrlHelper::getArrayFromQueryString($parsedUrl[$part]);
        $parsedUrl[$part] = UrlHelper::getQueryStringWithExcludedParameters($queryParameters, $parametersToExclude);

        return UrlHelper::getParseUrlReverse($parsedUrl);
    }

    /**
     * Returns the array of parameter names that must be excluded from the query string in all tracked URLs.
     *
     * The list is the union of the site's 'excluded_parameters' attribute (read from the
     * tracker cache), the built-in self::$queryParametersToExclude and the campaign
     * name/keyword parameters. All names are returned lower-cased.
     *
     * @static
     * @param int $idSite
     * @return array
     */
    public static function getQueryParametersToExclude($idSite)
    {
        $campaignParameters = Common::getCampaignParameters();
        $campaignTrackingParameters = array_merge(
            $campaignParameters[0], // campaign name parameters
            $campaignParameters[1]  // campaign keyword parameters
        );

        $website = Cache::getCacheWebsiteAttributes($idSite);
        $excludedParameters = isset($website['excluded_parameters'])
            ? $website['excluded_parameters']
            : array();

        if (!empty($excludedParameters)) {
            Common::printDebug('Excluding parameters "' . implode(',', $excludedParameters) . '" from URL');
        }

        $parametersToExclude = array_merge(
            $excludedParameters,
            self::$queryParametersToExclude,
            $campaignTrackingParameters
        );

        return array_map('strtolower', $parametersToExclude);
    }

    /**
     * Returns true if URL fragments should be removed for a specific site,
     * false if otherwise.
     *
     * This function uses the Tracker cache and not the MySQL database.
     * A site whose cached attributes do not contain 'keep_url_fragment' is
     * treated like a site where the setting is off (fragments are removed).
     *
     * @param $idSite int The ID of the site to check for.
     * @return bool
     */
    public static function shouldRemoveURLFragmentFor($idSite)
    {
        $websiteAttributes = Cache::getCacheWebsiteAttributes($idSite);

        // empty() instead of a plain negation: same result, but no undefined
        // index notice when the attribute is missing from the cache entry
        return empty($websiteAttributes['keep_url_fragment']);
    }

    /**
     * Cleans and/or removes the URL fragment of a URL.
     *
     * @param $urlFragment string The URL fragment to process.
     * @param $idSite int|bool If not false, this function will check if URL fragments
     *                         should be removed for the site w/ this ID and if so,
     *                         the returned processed fragment will be empty.
     *
     * @return string The processed URL fragment.
     */
    public static function processUrlFragment($urlFragment, $idSite = false)
    {
        // if we should discard the url fragment for this site, return an empty string as
        // the processed url fragment
        if ($idSite !== false && self::shouldRemoveURLFragmentFor($idSite)) {
            return '';
        }

        // Remove trailing Hash tag in ?query#hash# (only a single one is removed)
        if (substr($urlFragment, -1) == '#') {
            $urlFragment = substr($urlFragment, 0, strlen($urlFragment) - 1);
        }

        return $urlFragment;
    }

    /**
     * Will clean up the hostname (some browsers do not lower-case the hostname),
     * and deal with the hash tag on incoming URLs based on the website setting.
     *
     * @param array|bool $parsedUrl Result of parse_url(); empty values are returned untouched.
     * @param $idSite int|bool The site ID of the current visit. This parameter is
     *                         only used by the tracker to see if we should remove
     *                         the URL fragment for this site.
     * @return array
     */
    protected static function cleanupHostAndHashTag($parsedUrl, $idSite = false)
    {
        if (empty($parsedUrl)) {
            return $parsedUrl;
        }

        if (!empty($parsedUrl['host'])) {
            $parsedUrl['host'] = mb_strtolower($parsedUrl['host'], 'UTF-8');
        }

        if (!empty($parsedUrl['fragment'])) {
            $parsedUrl['fragment'] = self::processUrlFragment($parsedUrl['fragment'], $idSite);
        }

        return $parsedUrl;
    }

    /**
     * Converts Matrix URL format
     * from http://example.org/thing;paramA=1;paramB=6542
     * to   http://example.org/thing?paramA=1&paramB=6542
     *
     * URLs without a semicolon, and URLs whose "?" comes before the first
     * semicolon, are returned unchanged. When a "?" follows the first semicolon
     * it is treated as one more parameter separator.
     *
     * @param string $originalUrl
     * @return string
     */
    public static function convertMatrixUrl($originalUrl)
    {
        $posFirstSemiColon = strpos($originalUrl, ';');
        if ($posFirstSemiColon === false) {
            return $originalUrl;
        }

        $posQuestionMark = strpos($originalUrl, '?');

        // A regular query string already starts before the matrix part: leave as is
        if ($posQuestionMark !== false && $posQuestionMark < $posFirstSemiColon) {
            return $originalUrl;
        }

        // "?" located after the first ";" is demoted to a plain separator, so the
        // first ";" can become the one and only "?"
        if ($posQuestionMark !== false) {
            $originalUrl = substr_replace($originalUrl, ';', $posQuestionMark, 1);
        }

        $originalUrl = substr_replace($originalUrl, '?', $posFirstSemiColon, 1);

        return str_replace(';', '&', $originalUrl);
    }

    /**
     * Clean up string contents (filter, truncate, ...)
     *
     * Trims the string, removes line feeds, carriage returns and NUL bytes and
     * truncates the result to the [Tracker] page_maximum_length config value.
     * The limit is counted in bytes; when mbstring is available the cut is made
     * on a UTF-8 character boundary so that a multibyte character is never split.
     *
     * @param string $string Dirty string
     * @return string
     */
    public static function cleanupString($string)
    {
        $string = trim($string);
        $string = str_replace(array("\n", "\r", "\0"), '', $string);

        $limit = Config::getInstance()->Tracker['page_maximum_length'];

        if (function_exists('mb_strcut')) {
            // same byte budget as substr(), but never leaves half a character behind
            return mb_strcut($string, 0, $limit, 'UTF-8');
        }

        return substr($string, 0, $limit);
    }

    /**
     * Re-encodes a single query parameter value from $encoding to UTF-8.
     *
     * Non-string values, values that are not valid in $encoding and values for
     * which $encoding is not a charset known to mbstring are returned unchanged.
     *
     * @param mixed $value URL-encoded parameter value
     * @param string $encoding Charset the value is assumed to be encoded in
     * @return mixed
     */
    protected static function reencodeParameterValue($value, $encoding)
    {
        if (!is_string($value)) {
            return $value;
        }

        $decoded = urldecode($value);

        try {
            // "@" silences the unknown-encoding warning raised by PHP < 8
            if (@mb_check_encoding($decoded, $encoding)) {
                $value = urlencode(mb_convert_encoding($decoded, 'UTF-8', $encoding));
            }
        } catch (\ValueError $e) {
            // PHP >= 8 throws instead of warning when the charset is unknown.
            // The charset is supplied by the tracked page, so this is a
            // best-effort conversion: keep the value as it was received.
        }

        return $value;
    }

    /**
     * Applies reencodeParameterValue() to every value of a (possibly nested)
     * array of query parameters.
     *
     * @param array $queryParameters
     * @param string $encoding
     * @return array The array with re-encoded values; keys are preserved
     */
    protected static function reencodeParametersArray($queryParameters, $encoding)
    {
        foreach ($queryParameters as $name => $value) {
            $queryParameters[$name] = is_array($value)
                ? self::reencodeParametersArray($value, $encoding)
                : self::reencodeParameterValue($value, $encoding);
        }

        return $queryParameters;
    }

    /**
     * Checks if query parameters are of a non-UTF-8 encoding and converts the values
     * from the specified encoding to UTF-8.
     * This method is used to workaround browser/webapp bugs (see #3450). When
     * browsers fail to encode query parameters in UTF-8, the tracker will send the
     * charset of the page viewed and we can sometimes work around invalid data
     * being stored.
     *
     * The array is modified in place (it is taken by reference) and also returned.
     * Nothing is done when no encoding is given, when the encoding is UTF-8 or
     * when the mbstring functions are not available.
     *
     * @param array $queryParameters Name/value mapping of query parameters.
     * @param bool|string $encoding of the HTML page the URL is for. Used to workaround
     *                              browser bugs & mis-coded webapps. See #3450.
     *
     * @return array
     */
    public static function reencodeParameters(&$queryParameters, $encoding = false)
    {
        // if query params are encoded w/ non-utf8 characters (due to browser bug or whatever),
        // encode to UTF-8.
        if ($encoding !== false
            && strtolower($encoding) != 'utf-8'
            && function_exists('mb_check_encoding')
        ) {
            $queryParameters = self::reencodeParametersArray($queryParameters, $encoding);
        }

        return $queryParameters;
    }

    /**
     * Prepares a raw tracked URL for further processing: reverts the input
     * sanitization, cleans up the string (see cleanupString()) and converts
     * matrix parameters to a regular query string (see convertMatrixUrl()).
     *
     * @param string $url
     * @return string
     */
    public static function cleanupUrl($url)
    {
        $url = Common::unsanitizeInputValue($url);
        $url = self::cleanupString($url);
        $url = self::convertMatrixUrl($url);

        return $url;
    }

    /**
     * Build the full URL from the prefix ID and the rest.
     *
     * When $prefixId is null or unknown the URL is used without a prefix. The
     * result goes through the host / hash tag clean up; if rebuilding it yields
     * an empty value, the plain concatenation is returned instead.
     *
     * @param string $url
     * @param integer $prefixId
     * @return string
     */
    public static function reconstructNormalizedUrl($url, $prefixId)
    {
        $map = array_flip(self::$urlPrefixMap);

        if ($prefixId !== null && isset($map[$prefixId])) {
            $fullUrl = $map[$prefixId] . $url;
        } else {
            $fullUrl = $url;
        }

        // Clean up host & hash tags, for URLs
        $parsedUrl = @parse_url($fullUrl);
        $parsedUrl = self::cleanupHostAndHashTag($parsedUrl);
        $url = UrlHelper::getParseUrlReverse($parsedUrl);

        if (!empty($url)) {
            return $url;
        }

        return $fullUrl;
    }

    /**
     * Extract the prefix from a URL.
     * Return the prefix ID and the rest.
     *
     * The prefix is matched case-insensitively against self::$urlPrefixMap.
     * When no prefix matches, the URL is returned untouched with a null prefix ID.
     *
     * @param string $url
     * @return array array('url' => rest of the URL, 'prefixId' => int|null)
     */
    public static function normalizeUrl($url)
    {
        foreach (self::$urlPrefixMap as $prefix => $id) {
            $prefixLength = strlen($prefix);

            if (strncasecmp($url, $prefix, $prefixLength) === 0) {
                return array(
                    'url'      => substr($url, $prefixLength),
                    'prefixId' => $id
                );
            }
        }

        return array('url' => $url, 'prefixId' => null);
    }

    /**
     * Cleans up the given URL and returns it if it looks like a URL.
     *
     * @param string $url
     * @return string|bool The cleaned up URL, or false (after printing a debug
     *                     warning) when UrlHelper::isLookLikeUrl() rejects it
     */
    public static function getUrlIfLookValid($url)
    {
        $url = self::cleanupString($url);

        if (!UrlHelper::isLookLikeUrl($url)) {
            Common::printDebug("WARNING: URL looks invalid and is discarded");
            return false;
        }

        return $url;
    }
}