From 4df159ba7ee581cce8f8c293d494f797c777ccb7 Mon Sep 17 00:00:00 2001 From: oliver Date: Sat, 9 Apr 2016 15:57:11 +0200 Subject: [PATCH] add docstrings to similarity methods --- app/lib/Similarity.inc | 91 ++++++++++++++++------ questtypes/submit/SubmitQuesttypeModel.inc | 64 ++++++++++++--- 2 files changed, 121 insertions(+), 34 deletions(-) diff --git a/app/lib/Similarity.inc b/app/lib/Similarity.inc index da184e8b..fa27c986 100644 --- a/app/lib/Similarity.inc +++ b/app/lib/Similarity.inc @@ -22,8 +22,13 @@ /** - * TODO readDocument() - * used + * Read a file and return its text. + * + * Currently only PDF-files are supported and “pdftotext” needs to be + * installed. If reading fails, false is returned. + * + * @param string $filename Name of file to read + * @return mixed Text of document (string) or false (boolean) */ public static function readDocument($filename) { @@ -46,12 +51,16 @@ /** - * TODO splitNgrams() - * used + * Split a text into N-grams. + * + * The default N is 3. + * + * @param string $document Text to be split + * @param int $n Size of grams to split into (N) + * @return array List of n-grams */ - public static function splitNgrams($document) + public static function splitNgrams($document, $n=3) { - $n = 3; $affix = implode(' ', array_fill(0, $n-1, ' ')); $document = $affix.$document.$affix; $ngrams = array(); @@ -65,8 +74,17 @@ /** - * TODO compare() - * used + * Compare two documents, represented by their Term Frequencies (TFs) + * values. + * + * $tfsA, $tfsB and $idf_n are expected to be associative arrays with + * the term as key and the corresponding frequency as value. 
+ * + * @param array $tfsA Term Frequencies of document A + * @param array $tfsB Term Frequencies of document B + * @param int $idf_N Total count of documents in corpus + * @param array $idf_n Inverse Document Frequencies of all terms + * @return float Similarity value (between 0.0 and 1.0) */ public static function compare($tfsA, $tfsB, $idf_N, $idf_n) { @@ -88,8 +106,17 @@ /** - * TODO getVector() - * used + * Calculate the vector for a document based on TF and IDF. + * + * $tfs and $idf_n are expected to be associative arrays with the term + * as key and the corresponding frequency as value. The resulting + * vector is an associative array with the terms as keys and their + * corresponding values as value. + * + * @param array $tfs Term Frequencies of document + * @param int $idf_N Total count of documents in corpus + * @param array $idf_n Inverse Document Frequencies of all terms + * @return array Document vector */ protected static function getVector($tfs, $idf_N, $idf_n) { @@ -101,9 +128,18 @@ } - /** - * TODO getTFIDFs() - * used + /** + * Calculate TF*IDF values for a document. + * + * $tfs and $idf_n are expected to be associative arrays with the term + * as key and the corresponding frequency as value. The resulting + * value is an associative array with the terms as keys and their + * corresponding TF*IDF as values. + * + * @param array $tfs Term Frequencies of document + * @param int $idf_N Total count of documents in corpus + * @param array $idf_n Inverse Document Frequencies of all terms + * @return array TF*IDF values */ protected static function getTFIDFs($tfs, $idf_N, $idf_n) { @@ -128,9 +164,13 @@ /** - * TODO cosinus() + * Calculate cosine similarity between two vectors. 
+ * * sim(a, b) = (a・b) / (||a|| * ||b||) - * used + * + * @param array $a Vector A + * @param array $b Vector B + * @return float Similarity value (between 0.0 and 1.0) */ protected static function cosinus(array $a, array $b) { @@ -146,9 +186,13 @@ /** - * TODO Dot product + * Calculate the dot-product for two vectors. + * * a・b = summation{i=1,n}(a[i] * b[i]) - * used + * + * @param array $a Vector A + * @param array $b Vector B + * @return float Dot-product */ protected static function dotProduct(array $a, array $b) { @@ -156,10 +200,8 @@ $keysA = array_keys(array_filter($a)); $keysB = array_keys(array_filter($b)); $uniqueKeys = array_unique(array_merge($keysA, $keysB)); - foreach($uniqueKeys as $key) - { - if(!empty($a[$key]) && !empty($b[$key])) - { + foreach($uniqueKeys as $key) { + if(!empty($a[$key]) && !empty($b[$key])) { $dotProduct += ($a[$key] * $b[$key]); } } @@ -169,9 +211,12 @@ /** - * TODO Euclidean norm + * Calculate the Euclidean norm for a vector. + * * ||x|| = sqrt(x・x) // ・ is a dot product - * used + * + * @param array $vector Vector + * @return float Euclidean norm */ protected static function norm(array $vector) { diff --git a/questtypes/submit/SubmitQuesttypeModel.inc b/questtypes/submit/SubmitQuesttypeModel.inc index f760438d..98d618bb 100644 --- a/questtypes/submit/SubmitQuesttypeModel.inc +++ b/questtypes/submit/SubmitQuesttypeModel.inc @@ -19,6 +19,13 @@ */ class SubmitQuesttypeModel extends \hhu\z\models\QuesttypeModel { + /** + * Minimum similarity value for two submissions + * + * @var float + */ + const SIMILARITY_MIN = 0.8; + /** * Required models * @@ -171,7 +178,13 @@ /** - * TODO getSimilarSubmissions() + * Get similar submissions for a Character submission. 
+ * + * @param int $seminaryId ID of Seminary + * @param int $questId ID of Quest + * @param int $characterId ID of Character + * @param int $submissionId ID of submission + * @return array List of submissions */ public function getSimilarSubmissions($seminaryId, $questId, $characterId, $submissionId) { @@ -185,14 +198,17 @@ // Get stored TFs of submission $tfsA = $this->getTFs($submissionId); - // Iterate through submissions of same task + // Get submissions of same task $submissions = $this->getSubmissionsForQuest( $questId, $characterId, $submissionId ); + + // Iterate through submissions of same task foreach($submissions as &$submission) { + // Check if similarity has already been calculated if(is_null($submission['similarity'])) { // Get stored TFs of submissions to compare to @@ -215,7 +231,7 @@ } // Add high simnilarities to list - if($submission['similarity'] >= 0.7) { + if($submission['similarity'] >= self::SIMILARITY_MIN) { $similarSubmissions[] = $submission; } } @@ -228,7 +244,10 @@ /** - * TODO addDocument() + * Index a submission as document. + * + * @param int $submissionId ID of submission + * @param string $filename Full file path of document to read */ private function addDocument($submissionId, $filename) { @@ -247,7 +266,10 @@ /** - * TODO addTerms() + * Add terms to the corpus, stored in database. + * + * @param int $submissionId ID of submission + * @param array $terms List of (non-unique) terms */ private function addTerms($submissionId, $terms) { @@ -286,6 +308,15 @@ } + /** + * Get all submissions for a Quest including similarity values to the + * given submission, excluding the submissions of the given Character. 
+ * + * @param int $questId ID of Quest + * @param int $characterId ID of Character to exclude submissions of + * @param int $submissionId ID of submission to get similarity values for + * @return array List of submissions + */ private function getSubmissionsForQuest($questId, $characterId, $submissionId) { return $this->db->query( @@ -301,7 +332,10 @@ /** - * TODO getTFs() + * Get Term Frequency (TF) values for a submission. + * + * @param int $submissionId ID of submission + * @return array Associative array with term as key and frequency as value */ private function getTFs($submissionId) { @@ -328,8 +362,10 @@ /** - * TODO getIDF_N() - * Total count of submissions (per Seminary) + * Get total count of submissions for a Seminary. + * + * @param int $seminaryId ID of Seminary + * @return int Total count of submissions */ private function getIDF_total($seminaryId) { @@ -352,8 +388,10 @@ /** - * TODO getIDF_n() - * Count of submissions each term is in (per Seminary) + * Get count of submissions each term is in for a Seminary. + * + * @param int $seminaryId ID of Seminary + * @return array Associative array with terms as keys and counts as values */ private function getIDF_docs($seminaryId) { @@ -381,7 +419,11 @@ /** - * TODO setSimilarity() + * Save the similarity of two submissions. + * + * @param int $submissionId1 ID of submission + * @param int $submissionId2 ID of submission + * @param float $similarity Similarity of both submissions */ private function setSimilarity($submissionId1, $submissionId2, $similarity) {