add docstrings to similarity methods

This commit is contained in:
oliver 2016-04-09 15:57:11 +02:00
commit 37d33bc636
2 changed files with 121 additions and 34 deletions

View file

@ -22,8 +22,13 @@
/** /**
* TODO readDocument() * Read a file and return its text.
* used *
* Currently only PDF-files are supported and “pdftotext” needs to be
* installed. If reading fails, false is returned.
*
* @param string $filename Name of file to read
* @return mixed Text of document (string) or false (boolean)
*/ */
public static function readDocument($filename) public static function readDocument($filename)
{ {
@ -46,12 +51,16 @@
/** /**
* TODO splitNgrams() * Split a text into N-grams.
* used *
* The default N is 3.
*
* @param string $document Text to be split
* @param int $n Size of grams to split into (N)
* @return array List of n-grams
*/ */
public static function splitNgrams($document) public static function splitNgrams($document, $n=3)
{ {
$n = 3;
$affix = implode(' ', array_fill(0, $n-1, ' ')); $affix = implode(' ', array_fill(0, $n-1, ' '));
$document = $affix.$document.$affix; $document = $affix.$document.$affix;
$ngrams = array(); $ngrams = array();
@ -65,8 +74,17 @@
/** /**
* TODO compare() * Compare two documents, represented by their Term Frequencies (TFs)
* used * values.
*
* $tfsA, $tfsB and $idf_n are expected to be associative arrays with
* the term as key and the corresponding frequency as value.
*
* @param array $tfsA Term Frequencies of document A
* @param array $tfsB Term Frequencies of document B
* @param int $idf_N Total count of documents in corpus
* @param array $idf_n Inverse Document Frequencies of all terms
* @return float Similarity value (between 0.0 and 1.0)
*/ */
public static function compare($tfsA, $tfsB, $idf_N, $idf_n) public static function compare($tfsA, $tfsB, $idf_N, $idf_n)
{ {
@ -88,8 +106,17 @@
/** /**
* TODO getVector() * Calculate the vector for a document based on TF and IDF.
* used *
* $tfs and $idf_n are expected to be associative arrays with the term
* as key and the corresponding frequency as value. The resulting
* vector is an associative array with the terms as keys and their
* corresponding values as value.
*
* @param array $tfs Term Frequencies of document
* @param int $idf_N Total count of documents in corpus
* @param array $idf_n Inverse Document Frequencies of all terms
* @return array Document vector
*/ */
protected static function getVector($tfs, $idf_N, $idf_n) protected static function getVector($tfs, $idf_N, $idf_n)
{ {
@ -101,9 +128,18 @@
} }
/** /**
* TODO getTFIDFs() * Calculate TF*IDF values for a document.
* used *
* $tfs and $idf_n are expected to be associative arrays with the term
* as key and the corresponding frequency as value. The resulting
* value is an associative array with the terms as keys and their
* corresponding TF*IDF as values.
*
* @param array $tfs Term Frequencies of document
* @param int $idf_N Total count of documents in corpus
* @param array $idf_n Inverse Document Frequencies of all terms
* @return array TF*IDF values
*/ */
protected static function getTFIDFs($tfs, $idf_N, $idf_n) protected static function getTFIDFs($tfs, $idf_N, $idf_n)
{ {
@ -128,9 +164,13 @@
/** /**
* TODO cosinus() * Calculate cosine similarity between two vectors.
*
* sim(a, b) = (a・b) / (||a|| * ||b||) * sim(a, b) = (a・b) / (||a|| * ||b||)
* used *
* @param array $a Vector A
* @param array $b Vector B
* @return float Similarity value (between 0.0 and 1.0)
*/ */
protected static function cosinus(array $a, array $b) protected static function cosinus(array $a, array $b)
{ {
@ -146,9 +186,13 @@
/** /**
* TODO Dot product * Calculate the dot-product for two vectors.
*
* a・b = summation{i=1,n}(a[i] * b[i]) * a・b = summation{i=1,n}(a[i] * b[i])
* used *
* @param array $a Vector A
* @param array $b Vector B
* @return float Dot-product
*/ */
protected static function dotProduct(array $a, array $b) protected static function dotProduct(array $a, array $b)
{ {
@ -156,10 +200,8 @@
$keysA = array_keys(array_filter($a)); $keysA = array_keys(array_filter($a));
$keysB = array_keys(array_filter($b)); $keysB = array_keys(array_filter($b));
$uniqueKeys = array_unique(array_merge($keysA, $keysB)); $uniqueKeys = array_unique(array_merge($keysA, $keysB));
foreach($uniqueKeys as $key) foreach($uniqueKeys as $key) {
{ if(!empty($a[$key]) && !empty($b[$key])) {
if(!empty($a[$key]) && !empty($b[$key]))
{
$dotProduct += ($a[$key] * $b[$key]); $dotProduct += ($a[$key] * $b[$key]);
} }
} }
@ -169,9 +211,12 @@
/** /**
* TODO Euclidean norm * Calculate the Euclidean norm for a vector.
*
* ||x|| = sqrt(x・x) // ・ is a dot product * ||x|| = sqrt(x・x) // ・ is a dot product
* used *
* @param array $vector Vector
* @return float Euclidean norm
*/ */
protected static function norm(array $vector) protected static function norm(array $vector)
{ {

View file

@ -19,6 +19,13 @@
*/ */
class SubmitQuesttypeModel extends \hhu\z\models\QuesttypeModel class SubmitQuesttypeModel extends \hhu\z\models\QuesttypeModel
{ {
/**
* Minimum similarity value for two submissions
*
* @var float
*/
const SIMILARITY_MIN = 0.8;
/** /**
* Required models * Required models
* *
@ -171,7 +178,13 @@
/** /**
* TODO getSimilarSubmissions() * Get similar submissions for a Character submission.
*
* @param int $seminaryId ID of Seminary
* @param int $questId ID of Quest
* @param int $characterId ID of Character
* @param int $submissionId ID of submission
* @return array List of submissions
*/ */
public function getSimilarSubmissions($seminaryId, $questId, $characterId, $submissionId) public function getSimilarSubmissions($seminaryId, $questId, $characterId, $submissionId)
{ {
@ -185,14 +198,17 @@
// Get stored TFs of submission // Get stored TFs of submission
$tfsA = $this->getTFs($submissionId); $tfsA = $this->getTFs($submissionId);
// Iterate through submissions of same task // Get submissions of same task
$submissions = $this->getSubmissionsForQuest( $submissions = $this->getSubmissionsForQuest(
$questId, $questId,
$characterId, $characterId,
$submissionId $submissionId
); );
// Iterate through submissions of same task
foreach($submissions as &$submission) foreach($submissions as &$submission)
{ {
// Check if similarity has already been calculated
if(is_null($submission['similarity'])) if(is_null($submission['similarity']))
{ {
// Get stored TFs of submissions to compare to // Get stored TFs of submissions to compare to
@ -215,7 +231,7 @@
} }
// Add high similarities to list // Add high similarities to list
if($submission['similarity'] >= 0.7) { if($submission['similarity'] >= self::SIMILARITY_MIN) {
$similarSubmissions[] = $submission; $similarSubmissions[] = $submission;
} }
} }
@ -228,7 +244,10 @@
/** /**
* TODO addDocument() * Index a submission as document.
*
* @param int $submissionId ID of submission
* @param string $filename Full file path of document to read
*/ */
private function addDocument($submissionId, $filename) private function addDocument($submissionId, $filename)
{ {
@ -247,7 +266,10 @@
/** /**
* TODO addTerms() * Add terms to the corpus, stored in database.
*
* @param int $submissionId ID of submission
* @param array $terms List of (non-unique) terms
*/ */
private function addTerms($submissionId, $terms) private function addTerms($submissionId, $terms)
{ {
@ -286,6 +308,15 @@
} }
/**
* Get all submissions for a Quest including similarity values to the
* given submission, excluding the submissions of the given Character.
*
* @param int $questId ID of Quest
* @param int $characterId ID of Character to exclude submissions of
* @param int $submissionId ID of submission to get similarity values for
* @return array List of submissions
*/
private function getSubmissionsForQuest($questId, $characterId, $submissionId) private function getSubmissionsForQuest($questId, $characterId, $submissionId)
{ {
return $this->db->query( return $this->db->query(
@ -301,7 +332,10 @@
/** /**
* TODO getTFs() * Get Term Frequency (TF) values for a submission.
*
* @param int $submissionId ID of submission
* @return array Associative array with term as key and frequency as value
*/ */
private function getTFs($submissionId) private function getTFs($submissionId)
{ {
@ -328,8 +362,10 @@
/** /**
* TODO getIDF_N() * Get total count of submissions for a Seminary.
* Total count of submissions (per Seminary) *
* @param int $seminaryId ID of Seminary
* @return int Total count of submissions
*/ */
private function getIDF_total($seminaryId) private function getIDF_total($seminaryId)
{ {
@ -352,8 +388,10 @@
/** /**
* TODO getIDF_n() * Get count of submissions each term is in for a Seminary.
* Count of submissions each term is in (per Seminary) *
* @param int $seminaryId ID of Seminary
* @return array Associative array with terms as keys and counts as values
*/ */
private function getIDF_docs($seminaryId) private function getIDF_docs($seminaryId)
{ {
@ -381,7 +419,11 @@
/** /**
* TODO setSimilarity() * Save the similarity of two submissions.
*
* @param int $submissionId1 ID of submission
* @param int $submissionId2 ID of submission
* @param float $similarity Similarity of both submissions
*/ */
private function setSimilarity($submissionId1, $submissionId2, $similarity) private function setSimilarity($submissionId1, $submissionId2, $similarity)
{ {