add docstrings to similarity methods

2016-04-09 15:57:11 +02:00 · 2016-04-09 15:57:11 +02:00 · 4df159ba7e
commit 4df159ba7e
parent 53fda5caaf
2 changed files with 121 additions and 34 deletions
--- a/app/lib/Similarity.inc
+++ b/app/lib/Similarity.inc
@ -22,8 +22,13 @@


        /**
-         * TODO readDocument()
-         * used
+         * Read a file and return its text.
+         *
+         * Currently only PDF-files are supported and “pdftotext” needs to be
+         * installed. If reading fails, false is returned.
+         *
+         * @param   string  $filename   Name of file to read
+         * @return  mixed               Text of document (string) or false (boolean)
         */
        public static function readDocument($filename)
        {
@ -46,12 +51,16 @@


        /**
-         * TODO splitNgrams()
-         * used
+         * Split a text into N-grams.
+         *
+         * The default N is 3.
+         *
+         * @param   string  $document   Text to be split
+         * @param   int     $n          Size of grams to split into (N)
+         * @return  array               List of n-grams
         */
-        public static function splitNgrams($document)
+        public static function splitNgrams($document, $n=3)
        {
-            $n = 3;
            $affix = implode(' ', array_fill(0, $n-1, ' '));
            $document = $affix.$document.$affix;
            $ngrams = array();
@ -65,8 +74,17 @@


        /**
-         * TODO compare()
-         * used
+         * Compare two documents, represented by their Term Frequency (TF)
+         * values.
+         *
+         * $tfsA, $tfsB and $idf_n are expected to be associative arrays with
+         * the term as key and the corresponding frequency as value.
+         *
+         * @param   array   $tfsA   Term Frequencies of document A
+         * @param   array   $tfsB   Term Frequencies of document B
+         * @param   int     $idf_N  Total count of documents in corpus
+         * @param   array   $idf_n  Inverse Document Frequencies of all terms
+         * @return  float           Similarity value (between 0.0 and 1.0)
         */
        public static function compare($tfsA, $tfsB, $idf_N, $idf_n)
        {
@ -88,8 +106,17 @@


        /**
-         * TODO getVector()
-         * used
+         * Calculate the vector for a document based on TF and IDF.
+         *
+         * $tfs and $idf_n are expected to be associative arrays with the term
+         * as key and the corresponding frequency as value. The resulting
+         * vector is an associative array with the terms as keys and their
+         * corresponding values as value.
+         *
+         * @param   array   $tfs    Term Frequencies of document
+         * @param   int     $idf_N  Total count of documents in corpus
+         * @param   array   $idf_n  Inverse Document Frequencies of all terms
+         * @return  array           Document vector
         */
        protected static function getVector($tfs, $idf_N, $idf_n)
        {
@ -101,9 +128,18 @@
        }


-        /**
-         * TODO getTFIDFs()
-         * used
+        /**
+         * Calculate TF*IDF values for a document.
+         *
+         * $tfs and $idf_n are expected to be associative arrays with the term
+         * as key and the corresponding frequency as value. The resulting
+         * value is an associative array with the terms as keys and their
+         * corresponding TF*IDF as values.
+         *
+         * @param   array   $tfs    Term Frequencies of document
+         * @param   int     $idf_N  Total count of documents in corpus
+         * @param   array   $idf_n  Inverse Document Frequencies of all terms
+         * @return  array           TF*IDF values
         */
        protected static function getTFIDFs($tfs, $idf_N, $idf_n)
        {
@ -128,9 +164,13 @@


        /**
-         * TODO cosinus()
+         * Calculate cosine similarity between two vectors.
+         *
         * sim(a, b) = (a・b) / (||a|| * ||b||)
-         * used
+         *
+         * @param   array   $a  Vector A
+         * @param   array   $b  Vector B
+         * @return  float       Similarity value (between 0.0 and 1.0)
         */
        protected static function cosinus(array $a, array $b)
        {
@ -146,9 +186,13 @@


        /**
-         * TODO Dot product
+         * Calculate the dot-product for two vectors.
+         *
         * a・b = summation{i=1,n}(a[i] * b[i])
-         * used
+         *
+         * @param   array   $a  Vector A
+         * @param   array   $b  Vector B
+         * @return  float       Dot-product
         */
        protected static function dotProduct(array $a, array $b)
        {
@ -156,10 +200,8 @@
            $keysA = array_keys(array_filter($a));
            $keysB = array_keys(array_filter($b));
            $uniqueKeys = array_unique(array_merge($keysA, $keysB));
-            foreach($uniqueKeys as $key)
-            {
-                if(!empty($a[$key]) && !empty($b[$key]))
-                {
+            foreach($uniqueKeys as $key) {
+                if(!empty($a[$key]) && !empty($b[$key])) {
                    $dotProduct += ($a[$key] * $b[$key]);
                }
            }
@ -169,9 +211,12 @@


        /**
-         * TODO Euclidean norm
+         * Calculate the Euclidean norm for a vector.
+         *
         * ||x|| = sqrt(x・x) // ・ is a dot product
-         * used 
+         *
+         * @param   array   $vector Vector
+         * @return  float           Euclidean norm
         */
        protected static function norm(array $vector)
        {
--- a/questtypes/submit/SubmitQuesttypeModel.inc
+++ b/questtypes/submit/SubmitQuesttypeModel.inc
@ -19,6 +19,13 @@
     */
    class SubmitQuesttypeModel extends \hhu\z\models\QuesttypeModel
    {
+        /**
+         * Minimum similarity value for two submissions
+         *
+         * @var float
+         */
+        const SIMILARITY_MIN = 0.8;
+
        /**
         * Required models
         *
@ -171,7 +178,13 @@


        /**
-         * TODO getSimilarSubmissions()
+         * Get similar submissions for a Character submission.
+         *
+         * @param   int     $seminaryId     ID of Seminary
+         * @param   int     $questId        ID of Quest
+         * @param   int     $characterId    ID of Character
+         * @param   int     $submissionId   ID of submission
+         * @return  array                   List of submissions
         */
        public function getSimilarSubmissions($seminaryId, $questId, $characterId, $submissionId)
        {
@ -185,14 +198,17 @@
            // Get stored TFs of submission
            $tfsA = $this->getTFs($submissionId);

-            // Iterate through submissions of same task
+            // Get submissions of same task
            $submissions = $this->getSubmissionsForQuest(
                $questId,
                $characterId,
                $submissionId
            );
+
+            // Iterate through submissions of same task
            foreach($submissions as &$submission)
            {
+                // Check if similarity has already been calculated
                if(is_null($submission['similarity']))
                {
                    // Get stored TFs of submissions to compare to
@ -215,7 +231,7 @@
                }

                // Add high simnilarities to list
-                if($submission['similarity'] >= 0.7) {
+                if($submission['similarity'] >= self::SIMILARITY_MIN) {
                    $similarSubmissions[] = $submission;
                }
            }
@ -228,7 +244,10 @@


        /**
-         * TODO addDocument()
+         * Index a submission as document.
+         *
+         * @param   int     $submissionId   ID of submission
+         * @param   string  $filename       Full file path of document to read
         */
        private function addDocument($submissionId, $filename)
        {
@ -247,7 +266,10 @@


        /**
-         * TODO addTerms()
+         * Add terms to the corpus, stored in database.
+         *
+         * @param   int     $submissionId   ID of submission
+         * @param   array   $terms          List of (non-unique) terms
         */
        private function addTerms($submissionId, $terms)
        {
@ -286,6 +308,15 @@
        }


+        /**
+         * Get all submissions for a Quest including similarity values to the
+         * given submission, excluding the submissions of the given Character.
+         *
+         * @param   int     $questId        ID of Quest
+         * @param   int     $characterId    ID of Character to exclude submissions of
+         * @param   int     $submissionId   ID of submission to get similarity values for
+         * @return  array                   List of submissions
+         */
        private function getSubmissionsForQuest($questId, $characterId, $submissionId)
        {
            return $this->db->query(
@ -301,7 +332,10 @@


        /**
-         * TODO getTFs()
+         * Get Term Frequency (TF) values for a submission.
+         *
+         * @param   int     $submissionId   ID of submission
+         * @return  array                   Associative array with term as key and frequency as value
         */
        private function getTFs($submissionId)
        {
@ -328,8 +362,10 @@


        /**
-         * TODO getIDF_N()
-         * Total count of submissions (per Seminary)
+         * Get total count of submissions for a Seminary.
+         *
+         * @param   int $seminaryId ID of Seminary
+         * @return  int             Total count of submissions
         */
        private function getIDF_total($seminaryId)
        {
@ -352,8 +388,10 @@


        /**
-         * TODO getIDF_n()
-         * Count of submissions each term is in (per Seminary)
+         * Get count of submissions each term is in for a Seminary.
+         *
+         * @param   int     $seminaryId ID of Seminary
+         * @return  array   Associative array with terms as keys and counts as values
         */
        private function getIDF_docs($seminaryId)
        {
@ -381,7 +419,11 @@


        /**
-         * TODO setSimilarity()
+         * Save the similarity of two submissions.
+         *
+         * @param   int     $submissionId1  ID of submission
+         * @param   int     $submissionId2  ID of submission
+         * @param   float   $similarity     Similarity of both submissions
         */
        private function setSimilarity($submissionId1, $submissionId2, $similarity)
        {