* @copyright 2014 Heinrich-Heine-Universität Düsseldorf
 * @license http://www.gnu.org/licenses/gpl.html
 * @link https://bitbucket.org/coderkun/the-legend-of-z
 */

namespace hhu\z\lib;


/**
 * Class to calculate similarity between documents.
 *
 * @author Oliver Hanraths
 */
class Similarity
{

    /**
     * Read a file and return its text.
     *
     * Currently only PDF-files are supported and “pdftotext” needs to be
     * installed. If reading fails, false is returned.
     *
     * The output lines of “pdftotext” are joined without any separator and
     * the resulting text is converted to lower case.
     *
     * @param string $filename Name of file to read
     * @return mixed Text of document (string) or false (boolean)
     */
    public static function readDocument($filename)
    {
        if(!file_exists($filename)) {
            return false;
        }

        $text = array();
        $result = 0;
        // Quote the path with escapeshellarg() so that quotes, backticks or
        // “$” inside the filename cannot break out of the shell command.
        exec(sprintf('pdftotext %s -', escapeshellarg($filename)), $text, $result);
        if($result !== 0) {
            // pdftotext exited with a non-zero status
            return false;
        }

        return mb_strtolower(implode('', $text));
    }


    /**
     * Split a text into N-grams.
     *
     * The default N is 3.
     *
     * @param string $document Text to split
     * @param int $n Size of grams to split into (N)
     * @return array List of n-grams
     */
    public static function splitNgrams($document, $n=3)
    {
        // Pad the document on both sides before splitting it
        $affix = implode(' ', array_fill(0, $n-1, ' '));
        $document = $affix.$document.$affix;

        $ngrams = array();
        // NOTE(review): this copy of the file is damaged at this point. The
        // text between “$i” and “&$tf” in the loop header below is missing
        // (it looks as if everything between a “<” and a later “>” was
        // stripped). The remainder of splitNgrams(), any methods that
        // followed it and the head of the routine that fills $tfidfs have to
        // be restored from the original source. The surviving fragment is
        // kept verbatim and does not parse as it stands.
        for($i=0; $i &$tf) {
            if(array_key_exists($term, $idf_n)) {
                $idf = log($idf_N / $idf_n[$term], 2);
            }
            else {
                // TODO Laplace norm: n = 1?
                $idf = log($idf_N / 1, 2);
            }

            $tfidfs[$term] = $tf * $idf;
        }

        return $tfidfs;
    }


    /**
     * Calculate cosine similarity between two vectors.
     *
     * sim(a, b) = (a・b) / (||a|| * ||b||)
     *
     * @param array $a Vector A
     * @param array $b Vector B
     * @return float Similarity value (between 0.0 and 1.0 for vectors without negative components), 0 if one of the norms is zero
     */
    protected static function cosinus(array $a, array $b)
    {
        $normA = self::norm($a);
        $normB = self::norm($b);

        // Guard against a division by zero for empty/zero vectors
        $denominator = $normA * $normB;
        if($denominator != 0) {
            return self::dotProduct($a, $b) / $denominator;
        }

        return 0;
    }


    /**
     * Calculate the dot-product for two vectors.
     *
     * a・b = summation{i=1,n}(a[i] * b[i])
     *
     * The vectors are sparse arrays indexed by key. Only keys that carry a
     * non-empty value in both vectors contribute to the sum.
     *
     * @param array $a Vector A
     * @param array $b Vector B
     * @return float Dot-product (0 if no key contributes)
     */
    protected static function dotProduct(array $a, array $b)
    {
        $dotProduct = 0;
        // A key missing from (or empty in) either vector contributes
        // nothing, so one pass over $a covers every relevant key.
        foreach($a as $key => $value) {
            if(!empty($value) && !empty($b[$key])) {
                $dotProduct += ($value * $b[$key]);
            }
        }

        return $dotProduct;
    }


    /**
     * Calculate the Euclidean norm for a vector.
     *
     * ||x|| = sqrt(x・x) // ・ is a dot product
     *
     * @param array $vector Vector
     * @return float Euclidean norm
     */
    protected static function norm(array $vector)
    {
        return sqrt(self::dotProduct($vector, $vector));
    }

}

?>