questlab/app/lib/Similarity.inc

228 lines
6.8 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
/**
* Questlab
*
* @author Oliver Hanraths <oliver.hanraths@uni-duesseldorf.de>
* @copyright 2014 – 2016 Heinrich-Heine-Universität Düsseldorf
* @license http://www.gnu.org/licenses/gpl.html
* @link https://github.com/coderkun/questlab
*/
namespace hhu\z\lib;
/**
* Class to calculate similarity between documents.
*
* @author Oliver Hanraths <oliver.hanraths@uni-duesseldorf.de>
*/
class Similarity
{

    /**
     * Read a file and return its text, converted to lower case.
     *
     * Currently only PDF files are supported and the external program
     * "pdftotext" needs to be installed. If the file does not exist or
     * "pdftotext" exits with a non-zero status, false is returned.
     *
     * @param string $filename Name of file to read
     * @return mixed Text of document (string) or false (boolean)
     */
    public static function readDocument($filename)
    {
        if(!file_exists($filename)) {
            return false;
        }

        $lines = array();
        $status = 0;
        // The filename is handed to a shell, so it has to be escaped as a
        // single argument; wrapping it in plain double quotes would still
        // let characters such as ", $ or ` break out of the argument.
        // NOTE(review): escapeshellarg() is locale-dependent and may drop
        // multi-byte characters when LC_CTYPE is not a UTF-8 locale.
        $command = sprintf('pdftotext %s -', escapeshellarg($filename));
        exec($command, $lines, $status);
        if($status != 0) {
            return false;
        }

        // NOTE(review): exec() returns the output line by line without the
        // line breaks, and the lines are joined without a separator here, so
        // the last word of a line is glued to the first word of the next
        // one. Kept as is so that extracted texts stay comparable with
        // previously processed documents.
        return mb_strtolower(implode('', $lines));
    }


    /**
     * Split a text into character N-grams.
     *
     * The text is padded with N-1 blanks on both sides, so that every
     * character of the text occurs in exactly N grams. The default N is 3,
     * e.g. "ab" results in "  a", " ab", "ab " and "b  ".
     *
     * @throws \InvalidArgumentException If $n is smaller than 1
     * @param string $document Text to be split
     * @param int $n Size of grams to split into (N)
     * @return array List of n-grams
     */
    public static function splitNgrams($document, $n=3)
    {
        $n = (int) $n;
        if($n < 1) {
            throw new \InvalidArgumentException('N-gram size must be at least 1');
        }

        // Pad with exactly N-1 blanks on each side
        $affix = str_repeat(' ', $n - 1);
        $document = $affix.$document.$affix;

        // The last gram starts at offset (length - N), inclusive. The length
        // is determined once instead of on every iteration.
        $lastOffset = mb_strlen($document) - $n;
        $ngrams = array();
        for($i=0; $i<=$lastOffset; $i++) {
            $ngrams[] = mb_substr($document, $i, $n);
        }

        return $ngrams;
    }


    /**
     * Compare two documents, represented by their Term Frequency (TF)
     * values.
     *
     * Both documents are turned into TF*IDF vectors and the cosine
     * similarity of these vectors is returned.
     *
     * $tfsA, $tfsB and $idf_n are expected to be associative arrays with
     * the term as key and the corresponding frequency as value.
     *
     * @param array $tfsA Term Frequencies of document A
     * @param array $tfsB Term Frequencies of document B
     * @param int $idf_N Total count of documents in corpus
     * @param array $idf_n Document frequency of each term (divisor n of the IDF)
     * @return float Similarity value (between 0.0 and 1.0)
     */
    public static function compare($tfsA, $tfsB, $idf_N, $idf_n)
    {
        $vectorA = self::getVector($tfsA, $idf_N, $idf_n);
        $vectorB = self::getVector($tfsB, $idf_N, $idf_n);

        return self::cosinus($vectorA, $vectorB);
    }


    /**
     * Calculate the vector for a document based on TF and IDF.
     *
     * $tfs and $idf_n are expected to be associative arrays with the term
     * as key and the corresponding frequency as value. The resulting
     * vector is an associative array with the terms as keys and their
     * TF*IDF values as values.
     *
     * @param array $tfs Term Frequencies of document
     * @param int $idf_N Total count of documents in corpus
     * @param array $idf_n Document frequency of each term (divisor n of the IDF)
     * @return array Document vector
     */
    protected static function getVector($tfs, $idf_N, $idf_n)
    {
        return self::getTFIDFs($tfs, $idf_N, $idf_n);
    }


    /**
     * Calculate TF*IDF values for a document.
     *
     * For each term the value is TF * log2(N / n), where n is taken from
     * $idf_n. Terms that are missing in $idf_n, or whose entry is not a
     * positive number, are calculated with n = 1.
     *
     * $tfs and $idf_n are expected to be associative arrays with the term
     * as key and the corresponding frequency as value. The resulting
     * value is an associative array with the terms as keys and their
     * corresponding TF*IDF as values.
     *
     * @param array $tfs Term Frequencies of document
     * @param int $idf_N Total count of documents in corpus
     * @param array $idf_n Document frequency of each term (divisor n of the IDF)
     * @return array TF*IDF values
     */
    protected static function getTFIDFs($tfs, $idf_N, $idf_n)
    {
        $tfidfs = array();

        // Iterate by value: nothing is written back to $tfs
        foreach($tfs as $term => $tf)
        {
            // TODO Laplace norm: n = 1?
            $n = 1;
            // Only use the stored value if it is a valid divisor
            if(array_key_exists($term, $idf_n) && $idf_n[$term] > 0) {
                $n = $idf_n[$term];
            }

            $tfidfs[$term] = $tf * log($idf_N / $n, 2);
        }

        return $tfidfs;
    }


    /**
     * Calculate cosine similarity between two vectors.
     *
     * sim(a, b) = (a·b) / (||a|| * ||b||)
     *
     * If one of the vectors has a norm of zero, the similarity is defined
     * as 0.
     *
     * @param array $a Vector A
     * @param array $b Vector B
     * @return float Similarity value (between 0.0 and 1.0)
     */
    protected static function cosinus(array $a, array $b)
    {
        $denominator = self::norm($a) * self::norm($b);
        if($denominator == 0) {
            return 0.0;
        }

        return self::dotProduct($a, $b) / $denominator;
    }


    /**
     * Calculate the dot-product for two vectors.
     *
     * a·b = summation{i=1,n}(a[i] * b[i])
     *
     * The vectors are associative arrays; a component that is missing or
     * empty in one of the vectors is treated as zero.
     *
     * @param array $a Vector A
     * @param array $b Vector B
     * @return float Dot-product
     */
    protected static function dotProduct(array $a, array $b)
    {
        $dotProduct = 0;

        // Only components that are non-empty in both vectors contribute, so
        // a single pass over A with a lookup in B is sufficient.
        foreach($a as $key => $value) {
            if(!empty($value) && !empty($b[$key])) {
                $dotProduct += ($value * $b[$key]);
            }
        }

        return $dotProduct;
    }


    /**
     * Calculate the Euclidean norm for a vector.
     *
     * ||x|| = sqrt(x·x) // · is a dot product
     *
     * @param array $vector Vector
     * @return float Euclidean norm
     */
    protected static function norm(array $vector)
    {
        return sqrt(self::dotProduct($vector, $vector));
    }

}
?>