From 53fda5caafd64294161c15975165f85d1fa88905 Mon Sep 17 00:00:00 2001 From: oliver Date: Sat, 9 Apr 2016 13:21:23 +0200 Subject: [PATCH] implement similarity algorithm for questtype ?Submit? --- app/lib/Similarity.inc | 183 +++++++++++++ db/create.sql | 55 +++- .../submit/SubmitQuesttypeController.inc | 23 +- questtypes/submit/SubmitQuesttypeModel.inc | 245 ++++++++++++++++++ questtypes/submit/html/submission.tpl | 31 ++- www/css/desktop.css | 1 + 6 files changed, 533 insertions(+), 5 deletions(-) create mode 100644 app/lib/Similarity.inc diff --git a/app/lib/Similarity.inc b/app/lib/Similarity.inc new file mode 100644 index 00000000..da184e8b --- /dev/null +++ b/app/lib/Similarity.inc @@ -0,0 +1,183 @@ + + * @copyright 2014 Heinrich-Heine-Universität Düsseldorf + * @license http://www.gnu.org/licenses/gpl.html + * @link https://bitbucket.org/coderkun/the-legend-of-z + */ + + namespace hhu\z\lib; + + + /** + * Class to calculate similarity between documents. + * + * @author Oliver Hanraths + */ + class Similarity + { + + + /** + * Extract the text of a document by running the pdftotext command on it; the filename is passed through escapeshellarg() so it cannot inject shell syntax. + * Returns the lowercased text (output lines joined without separator), or false if the file is missing or pdftotext exits non-zero. + */ + public static function readDocument($filename) + { + if(!file_exists($filename)) { + return false; + } + + $text = array(); + $result = 0; + exec(sprintf('pdftotext %s -', escapeshellarg($filename)), $text, $result); + if($result != 0) { + return false; + } + + $text = mb_strtolower(implode('', $text)); + + + return $text; + } + + + /** + * TODO splitNgrams() + * used + */ + public static function splitNgrams($document) + { + $n = 3; + $affix = implode(' ', array_fill(0, $n-1, ' ')); + $document = $affix.$document.$affix; + $ngrams = array(); + for($i=0; $i &$tf) + { + if(array_key_exists($term, $idf_n)) { + $idf = log($idf_N / $idf_n[$term], 2); + } + else { + // TODO Laplace norm: n = 1?
+ $idf = log($idf_N / 1, 2); + } + $tfidfs[$term] = $tf * $idf; + } + + + return $tfidfs; + } + + + /** + * Cosine similarity of two vectors given as arrays; returns 0 if the product of their norms is 0. + * sim(a, b) = (a・b) / (||a|| * ||b||) + * Entries of both arrays are matched by key (see dotProduct()). + */ + protected static function cosinus(array $a, array $b) + { + $normA = self::norm($a); + $normB = self::norm($b); + if(($normA * $normB) != 0) { + return self::dotProduct($a, $b) / ($normA * $normB); + } + else { + return 0; + } + } + + + /** + * Dot product of two arrays, pairing entries by key. + * a・b = summation{i=1,n}(a[i] * b[i]) + * Keys that are missing or empty() in either array contribute nothing to the sum. + */ + protected static function dotProduct(array $a, array $b) + { + $dotProduct = 0; + $keysA = array_keys(array_filter($a)); + $keysB = array_keys(array_filter($b)); + $uniqueKeys = array_unique(array_merge($keysA, $keysB)); + foreach($uniqueKeys as $key) + { + if(!empty($a[$key]) && !empty($b[$key])) + { + $dotProduct += ($a[$key] * $b[$key]); + } + } + + return $dotProduct; + } + + + /** + * Euclidean norm of a vector given as an array. + * ||x|| = sqrt(x・x) // ・ is a dot product + * Computed as the square root of dotProduct($vector, $vector). + */ + protected static function norm(array $vector) + { + return sqrt(self::dotProduct($vector, $vector)); + } + + } + +?> diff --git a/db/create.sql b/db/create.sql index d695be29..eb88e859 100644 --- a/db/create.sql +++ b/db/create.sql @@ -1888,6 +1888,59 @@ CREATE TABLE `questtypes_submit_mimetypes` ( ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; /*!40101 SET character_set_client = @saved_cs_client */; +-- +-- Table structure for table `questtypes_submit_similarities` +-- + +DROP TABLE IF EXISTS `questtypes_submit_similarities`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!40101 SET character_set_client = utf8 */; +CREATE TABLE `questtypes_submit_similarities` ( + `submission_id1` int(11) NOT NULL, + `submission_id2` int(11) NOT NULL, + `created` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + `similarity` decimal(10,9) NOT NULL, + PRIMARY KEY (`submission_id1`,`submission_id2`), + KEY `submission_id2` (`submission_id2`), + CONSTRAINT `questtypes_submit_similarities_ibfk_1`
FOREIGN KEY (`submission_id1`) REFERENCES `questtypes_submit_characters` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, + CONSTRAINT `questtypes_submit_similarities_ibfk_2` FOREIGN KEY (`submission_id2`) REFERENCES `questtypes_submit_characters` (`id`) ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `questtypes_submit_submissions_terms` +-- + +DROP TABLE IF EXISTS `questtypes_submit_submissions_terms`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!40101 SET character_set_client = utf8 */; +CREATE TABLE `questtypes_submit_submissions_terms` ( + `submission_id` int(11) NOT NULL, + `term_id` int(11) NOT NULL, + `created` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + `tf` mediumint(8) unsigned NOT NULL DEFAULT '1', + PRIMARY KEY (`submission_id`,`term_id`), + KEY `term_id` (`term_id`), + CONSTRAINT `questtypes_submit_submissions_terms_ibfk_1` FOREIGN KEY (`submission_id`) REFERENCES `questtypes_submit_characters` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, + CONSTRAINT `questtypes_submit_submissions_terms_ibfk_2` FOREIGN KEY (`term_id`) REFERENCES `questtypes_submit_terms` (`id`) ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `questtypes_submit_terms` +-- + +DROP TABLE IF EXISTS `questtypes_submit_terms`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!40101 SET character_set_client = utf8 */; +CREATE TABLE `questtypes_submit_terms` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `term` varchar(9) COLLATE utf8mb4_unicode_ci NOT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `term` (`term`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='Terms/N-grams'; +/*!40101 SET character_set_client = @saved_cs_client */; + -- -- 
Table structure for table `questtypes_textinput` -- @@ -2663,4 +2716,4 @@ DELIMITER ; /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; --- Dump completed on 2016-03-26 19:13:31 +-- Dump completed on 2016-04-09 13:18:45 diff --git a/questtypes/submit/SubmitQuesttypeController.inc b/questtypes/submit/SubmitQuesttypeController.inc index d634e096..5cc17f0a 100644 --- a/questtypes/submit/SubmitQuesttypeController.inc +++ b/questtypes/submit/SubmitQuesttypeController.inc @@ -24,7 +24,7 @@ * * @var array */ - public $models = array('quests', 'uploads', 'users'); + public $models = array('quests', 'uploads', 'users', 'characters', 'questgroups'); @@ -210,6 +210,27 @@ catch(\nre\exceptions\IdNotFoundException $e) { } } + $submission['similar'] = $this->Submit->getSimilarSubmissions( + $seminary['id'], + $quest['id'], + $character['id'], + $submission['id'] + ); + foreach($submission['similar'] as &$similarSubmission) + { + $similarSubmission['quest'] = $this->Quests->getQuestById( + $similarSubmission['quest_id'] + ); + $similarSubmission['questgroup'] = $this->Questgroups->getQuestgroupById( + $similarSubmission['quest']['questgroup_id'] + ); + $similarSubmission['character'] = $this->Characters->getCharacterById( + $similarSubmission['character_id'] + ); + $similarSubmission['upload'] = $this->Uploads->getSeminaryuploadById( + $similarSubmission['upload_id'] + ); + } } // Status diff --git a/questtypes/submit/SubmitQuesttypeModel.inc b/questtypes/submit/SubmitQuesttypeModel.inc index b3b8f7bb..f760438d 100644 --- a/questtypes/submit/SubmitQuesttypeModel.inc +++ b/questtypes/submit/SubmitQuesttypeModel.inc @@ -81,6 +81,12 @@ $questId, $characterId, $uploadId ); + // Index submission for similarity calculation + $this->addDocument( + $this->db->getInsertId(), + ROOT.DS.\nre\configs\AppConfig::$dirs['seminaryuploads'].DS.$filename + ); + return true; } @@ -163,6 +169,245 @@ ); } + + /** + * TODO getSimilarSubmissions() 
+ */ + public function getSimilarSubmissions($seminaryId, $questId, $characterId, $submissionId) + { + // List of submissions with high similarity + $similarSubmissions = array(); + + // Get IDFs + $idf_N = $this->getIDF_total($seminaryId); + $idf_n = $this->getIDF_docs($seminaryId); + + // Get stored TFs of submission + $tfsA = $this->getTFs($submissionId); + + // Iterate through submissions of same task + $submissions = $this->getSubmissionsForQuest( + $questId, + $characterId, + $submissionId + ); + foreach($submissions as &$submission) + { + if(is_null($submission['similarity'])) + { + // Get stored TFs of submissions to compare to + $tfsB = $this->getTFs($submission['id']); + + // Calculate similarity + $submission['similarity'] = \hhu\z\lib\Similarity::compare( + $tfsA, + $tfsB, + $idf_N, + $idf_n + ); + + // Save similarity + $this->setSimilarity( + $submissionId, + $submission['id'], + $submission['similarity'] + ); + } + + // Add high similarities (>= 0.7) to list + if($submission['similarity'] >= 0.7) { + $similarSubmissions[] = $submission; + } + } + + + return $similarSubmissions; + } + + + + + /** + * Index a submitted file: read its text, split it into n-grams and store them as terms of the submission; returns false if the document cannot be read. + */ + private function addDocument($submissionId, $filename) + { + // Read document + $document = \hhu\z\lib\Similarity::readDocument($filename); + if($document === false) { + return false; + } + + // Split document into terms + $terms = \hhu\z\lib\Similarity::splitNgrams($document); + + // Store terms and their frequencies for this submission + $this->addTerms($submissionId, $terms); + } + + + /** + * Store the terms of a submission: each distinct term is registered once via INSERT IGNORE, and every occurrence is counted in the tf column of the link table. + */ + private function addTerms($submissionId, $terms) + { + // Terms already registered during this call + $uniqueTerms = array(); + foreach($terms as &$term) + { + if(!in_array($term, $uniqueTerms)) + { + // Add term to database + $this->db->query( + 'INSERT IGNORE INTO questtypes_submit_terms '. + '(term) '. + 'VALUES '.
+ '(?)', + 's', + $term + ); + $uniqueTerms[] = $term; + } + + // Link term to submission; an existing link gets its tf incremented instead + $this->db->query( + 'INSERT INTO questtypes_submit_submissions_terms '. + '(submission_id, term_id, tf) '. + 'SELECT ?, questtypes_submit_terms.id, 1 '. + 'FROM questtypes_submit_terms '. + 'WHERE term = ? '. + 'ON DUPLICATE KEY UPDATE '. + 'tf = tf + 1', + 'is', + $submissionId, + $term + ); + } + } + + + private function getSubmissionsForQuest($questId, $characterId, $submissionId) + { + return $this->db->query( + 'SELECT questtypes_submit_characters.id, questtypes_submit_characters.created, questtypes_submit_characters.quest_id, character_id, upload_id, questtypes_submit_similarities.similarity '. + 'FROM questtypes_submit_characters '. + 'LEFT JOIN questtypes_submit_similarities ON questtypes_submit_similarities.submission_id1 = ? AND questtypes_submit_similarities.submission_id2 = questtypes_submit_characters.id '. + 'WHERE quest_id = ? AND character_id != ?', + 'iii', + $submissionId, + $questId, $characterId + ); + } + + + /** + * Get the stored term frequencies of a submission as array(term => tf). + */ + private function getTFs($submissionId) + { + // Read terms + $terms = $this->db->query( + 'SELECT term, tf '. + 'FROM questtypes_submit_submissions_terms '. + 'INNER JOIN questtypes_submit_terms ON questtypes_submit_terms.id = questtypes_submit_submissions_terms.term_id '. + 'WHERE submission_id = ?', + 'i', + $submissionId + ); + + // Convert to TFs + $tfs = array(); + foreach($terms as &$term) { + $tfs[$term['term']] = $term['tf']; + } + + + // Return TFs + return $tfs; + } + + + /** + * Get the IDF value N (0 if the query returns no row). + * Total count of submissions (per Seminary) + */ + private function getIDF_total($seminaryId) + { + $data = $this->db->query( + 'SELECT count(questtypes_submit_characters.id) as c '. + 'FROM charactertypes '. + 'INNER JOIN characters ON characters.charactertype_id = charactertypes.id '. + 'INNER JOIN questtypes_submit_characters ON questtypes_submit_characters.character_id = characters.id '.
+ 'WHERE charactertypes.seminary_id = ?', + 'i', + $seminaryId + ); + if(!empty($data)) { + return $data[0]['c']; + } + + + return 0; + } + + + /** + * Get the IDF values n as array(term => count). + * Count of submissions each term is in (per Seminary) + */ + private function getIDF_docs($seminaryId) + { + $terms = $this->db->query( + 'SELECT questtypes_submit_terms.term, count(*) AS c '. + 'FROM charactertypes '. + 'INNER JOIN characters ON characters.charactertype_id = charactertypes.id '. + 'INNER JOIN questtypes_submit_characters ON questtypes_submit_characters.character_id = characters.id '. + 'INNER JOIN questtypes_submit_submissions_terms ON questtypes_submit_submissions_terms.submission_id = questtypes_submit_characters.id '. + 'INNER JOIN questtypes_submit_terms ON questtypes_submit_terms.id = questtypes_submit_submissions_terms.term_id '. + 'WHERE charactertypes.seminary_id = ? '. + 'GROUP BY questtypes_submit_terms.term', + 'i', + $seminaryId + ); + + $idfs = array(); + foreach($terms as &$term) { + $idfs[$term['term']] = $term['c']; + } + + + return $idfs; + } + + + /** + * Store the similarity of two submissions in both directions, (id1, id2) and (id2, id1), overwriting an existing value. + */ + private function setSimilarity($submissionId1, $submissionId2, $similarity) + { + $this->db->query( + 'INSERT INTO questtypes_submit_similarities '. + '(submission_id1, submission_id2, similarity) '. + 'VALUES '. + '(?, ?, ?) '. + 'ON DUPLICATE KEY UPDATE '. + 'similarity = ?', + 'iidd', + $submissionId1, $submissionId2, $similarity, + $similarity + ); + $this->db->query( + 'INSERT INTO questtypes_submit_similarities '. + '(submission_id1, submission_id2, similarity) '. + 'VALUES '. + '(?, ?, ?) '. + 'ON DUPLICATE KEY UPDATE '. + 'similarity = ?', + 'iidd', + $submissionId2, $submissionId1, $similarity, + $similarity + ); + } } ?> diff --git a/questtypes/submit/html/submission.tpl b/questtypes/submit/html/submission.tpl index 40618b45..72e6ed77 100644 --- a/questtypes/submit/html/submission.tpl +++ b/questtypes/submit/html/submission.tpl @@ -16,6 +16,24 @@ + +

+ +
    +
  • +

    : format($similar['similarity'])?>

    +

    +

    + + , + + , + format(new \DateTime($similar['created']))?> format(new \DateTime($similar['created']))?> +

    +
  • +
+ + @@ -24,10 +42,17 @@
-
-
- +
+ +
+ +
+ diff --git a/www/css/desktop.css b/www/css/desktop.css index 4704e82b..8ebf13a9 100644 --- a/www/css/desktop.css +++ b/www/css/desktop.css @@ -12,6 +12,7 @@ img{border:0} h1,h2,h3{color:#103a3e} h2{font-size:120%;margin-top:25px} h3{font-size:100%} +h4{margin-bottom:0} ul,ol,nav{padding:0;margin-top:0;list-style-type:none} p{margin:0 0 16px;padding:0} audio,canvas,video{display:inline-block}