diff --git a/app/lib/Similarity.inc b/app/lib/Similarity.inc new file mode 100644 index 00000000..fa27c986 --- /dev/null +++ b/app/lib/Similarity.inc @@ -0,0 +1,228 @@ + + * @copyright 2014 Heinrich-Heine-Universität Düsseldorf + * @license http://www.gnu.org/licenses/gpl.html + * @link https://bitbucket.org/coderkun/the-legend-of-z + */ + + namespace hhu\z\lib; + + + /** + * Class to calculate similarity between documents. + * + * @author Oliver Hanraths + */ + class Similarity + { + + + /** + * Read a file and return its text. + * + * Currently only PDF-files are supported and “pdftotext” needs to be + * installed. If reading fails, false is returned. The lines printed by + * “pdftotext” are joined without a separator and lower-cased. + * + * @param string $filename Name of file to read + * @return mixed Text of document (string) or false (boolean) + */ + public static function readDocument($filename) + { + if(!file_exists($filename)) { + return false; + } + + $text = array(); + $result = 0; + // Escape the filename so that quotes or shell metacharacters in it + // cannot change the executed command + exec(sprintf('pdftotext %s -', escapeshellarg($filename)), $text, $result); + if($result !== 0) { + return false; + } + + $text = mb_strtolower(implode('', $text)); + + + return $text; + } + + + /** + * Split a text into N-grams. + * + * The default N is 3. + * + * @param string $document Text to be split + * @param int $n Size of grams to split into (N) + * @return array List of n-grams + */ + public static function splitNgrams($document, $n=3) + { + $affix = implode(' ', array_fill(0, $n-1, ' ')); + $document = $affix.$document.$affix; + $ngrams = array(); + for($i=0; $i &$tf) + { + if(array_key_exists($term, $idf_n)) { + $idf = log($idf_N / $idf_n[$term], 2); + } + else { + // TODO Laplace norm: n = 1? + $idf = log($idf_N / 1, 2); + } + $tfidfs[$term] = $tf * $idf; + } + + + return $tfidfs; + } + + + /** + * Calculate cosine similarity between two vectors. 
+ * + * sim(a, b) = (a・b) / (||a|| * ||b||) + * + * If the product of both norms is zero, 0.0 is returned. + * + * @param array $a Vector A + * @param array $b Vector B + * @return float Similarity value (between 0.0 and 1.0) + */ + protected static function cosinus(array $a, array $b) + { + $denominator = self::norm($a) * self::norm($b); + if($denominator != 0) { + return self::dotProduct($a, $b) / $denominator; + } + + // Avoid a division by zero + return 0.0; + } + + + /** + * Calculate the dot-product for two vectors. + * + * a・b = summation{i=1,n}(a[i] * b[i]) + * + * The vectors are associative arrays; a key that is missing or has an + * empty value in one of them contributes nothing to the sum. + * + * @param array $a Vector A + * @param array $b Vector B + * @return float Dot-product + */ + protected static function dotProduct(array $a, array $b) + { + $dotProduct = 0; + // Only keys that exist in both vectors can contribute + foreach(array_intersect_key($a, $b) as $key => $valueA) { + if(!empty($valueA) && !empty($b[$key])) { + $dotProduct += ($valueA * $b[$key]); + } + } + + return $dotProduct; + } + + + /** + * Calculate the Euclidean norm for a vector. 
+ * + * ||x|| = sqrt(x・x) // ・ is a dot product + * + * @param array $vector Vector + * @return float Euclidean norm + */ + protected static function norm(array $vector) + { + return sqrt(self::dotProduct($vector, $vector)); + } + + } + +?> diff --git a/configs/AppConfig.inc b/configs/AppConfig.inc index a9b6f7f3..4c3ab885 100644 --- a/configs/AppConfig.inc +++ b/configs/AppConfig.inc @@ -153,6 +153,10 @@ array( 'mimetype' => 'image/png', 'size' => 1048576 + ), + array( + 'mimetype' => 'application/pdf', + 'size' => 1048576 ) ), 'map' => array( diff --git a/db/create.sql b/db/create.sql index d695be29..b6c1edc6 100644 --- a/db/create.sql +++ b/db/create.sql @@ -1866,28 +1866,58 @@ CREATE TABLE `questtypes_submit_characters_comments` ( /*!40101 SET character_set_client = @saved_cs_client */; -- --- Table structure for table `questtypes_submit_mimetypes` +-- Table structure for table `questtypes_submit_similarities` -- -DROP TABLE IF EXISTS `questtypes_submit_mimetypes`; +DROP TABLE IF EXISTS `questtypes_submit_similarities`; /*!40101 SET @saved_cs_client = @@character_set_client */; /*!40101 SET character_set_client = utf8 */; -CREATE TABLE `questtypes_submit_mimetypes` ( - `id` int(11) NOT NULL AUTO_INCREMENT, +CREATE TABLE `questtypes_submit_similarities` ( + `submission_id1` int(11) NOT NULL, + `submission_id2` int(11) NOT NULL, `created` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - `created_user_id` int(11) NOT NULL, - `seminary_id` int(11) NOT NULL, - `mimetype` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, - `size` int(10) unsigned NOT NULL DEFAULT '0', - PRIMARY KEY (`id`), - UNIQUE KEY `mimetype` (`mimetype`,`seminary_id`), - KEY `created_user_id` (`created_user_id`), - KEY `seminary_id` (`seminary_id`), - CONSTRAINT `questtypes_submit_mimetypes_ibfk_1` FOREIGN KEY (`created_user_id`) REFERENCES `users` (`id`), - CONSTRAINT `questtypes_submit_mimetypes_ibfk_2` FOREIGN KEY (`seminary_id`) REFERENCES `seminaries` (`id`) ON DELETE CASCADE ON UPDATE 
CASCADE + `similarity` decimal(10,9) NOT NULL, + PRIMARY KEY (`submission_id1`,`submission_id2`), + KEY `submission_id2` (`submission_id2`), + CONSTRAINT `questtypes_submit_similarities_ibfk_1` FOREIGN KEY (`submission_id1`) REFERENCES `questtypes_submit_characters` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, + CONSTRAINT `questtypes_submit_similarities_ibfk_2` FOREIGN KEY (`submission_id2`) REFERENCES `questtypes_submit_characters` (`id`) ON DELETE CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; /*!40101 SET character_set_client = @saved_cs_client */; +-- +-- Table structure for table `questtypes_submit_submissions_terms` +-- + +DROP TABLE IF EXISTS `questtypes_submit_submissions_terms`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!40101 SET character_set_client = utf8 */; +CREATE TABLE `questtypes_submit_submissions_terms` ( + `submission_id` int(11) NOT NULL, + `term_id` int(11) NOT NULL, + `created` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + `tf` mediumint(8) unsigned NOT NULL DEFAULT '1', + PRIMARY KEY (`submission_id`,`term_id`), + KEY `term_id` (`term_id`), + CONSTRAINT `questtypes_submit_submissions_terms_ibfk_1` FOREIGN KEY (`submission_id`) REFERENCES `questtypes_submit_characters` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, + CONSTRAINT `questtypes_submit_submissions_terms_ibfk_2` FOREIGN KEY (`term_id`) REFERENCES `questtypes_submit_terms` (`id`) ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `questtypes_submit_terms` +-- + +DROP TABLE IF EXISTS `questtypes_submit_terms`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!40101 SET character_set_client = utf8 */; +CREATE TABLE `questtypes_submit_terms` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `term` varchar(9) COLLATE utf8mb4_unicode_ci NOT NULL, + PRIMARY KEY (`id`), + 
UNIQUE KEY `term` (`term`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='Terms/N-grams'; +/*!40101 SET character_set_client = @saved_cs_client */; + -- -- Table structure for table `questtypes_textinput` -- @@ -2663,4 +2693,4 @@ DELIMITER ; /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; --- Dump completed on 2016-03-26 19:13:31 +-- Dump completed on 2016-04-09 13:18:45 diff --git a/questtypes/submit/SubmitQuesttypeController.inc b/questtypes/submit/SubmitQuesttypeController.inc index d634e096..5cc17f0a 100644 --- a/questtypes/submit/SubmitQuesttypeController.inc +++ b/questtypes/submit/SubmitQuesttypeController.inc @@ -24,7 +24,7 @@ * * @var array */ - public $models = array('quests', 'uploads', 'users'); + public $models = array('quests', 'uploads', 'users', 'characters', 'questgroups'); @@ -210,6 +210,27 @@ catch(\nre\exceptions\IdNotFoundException $e) { } } + // Load similar submissions and attach Quest, Questgroup, Character and upload to each of them + $submission['similar'] = $this->Submit->getSimilarSubmissions( + $seminary['id'], + $quest['id'], + $character['id'], + $submission['id'] + ); + foreach($submission['similar'] as &$similarSubmission) + { + $similarSubmission['quest'] = $this->Quests->getQuestById( + $similarSubmission['quest_id'] + ); + $similarSubmission['questgroup'] = $this->Questgroups->getQuestgroupById( + $similarSubmission['quest']['questgroup_id'] + ); + $similarSubmission['character'] = $this->Characters->getCharacterById( + $similarSubmission['character_id'] + ); + $similarSubmission['upload'] = $this->Uploads->getSeminaryuploadById( + $similarSubmission['upload_id'] + ); + } + // Break the reference so that a later write to $similarSubmission + // cannot modify the last entry of the list + unset($similarSubmission); } // Status diff --git a/questtypes/submit/SubmitQuesttypeModel.inc b/questtypes/submit/SubmitQuesttypeModel.inc index b3b8f7bb..7b829511 100644 --- a/questtypes/submit/SubmitQuesttypeModel.inc +++ b/questtypes/submit/SubmitQuesttypeModel.inc @@ -19,6 +19,19 @@ */ class SubmitQuesttypeModel extends \hhu\z\models\QuesttypeModel { + /** + * Minimum similarity value for two 
submissions + * + * @var float + */ + const SIMILARITY_MIN = 0.8; + /** + * Supported mimetypes + * + * @var array + */ + const mimetypes = array('application/pdf'); + /** * Required models * @@ -81,6 +94,12 @@ $questId, $characterId, $uploadId ); + // Index submission for similarity calculation + $this->addDocument( + $this->db->getInsertId(), + ROOT.DS.\nre\configs\AppConfig::$dirs['seminaryuploads'].DS.$filename + ); + return true; } @@ -131,18 +150,20 @@ /** * Get allowed mimetypes for uploading a file. * - * @param int $seminaryId ID of Seminary - * @return array Allowed mimetypes + * The list is taken from the application configuration and reduced to + * the mimetypes listed in self::mimetypes. + * + * @param int $seminaryId ID of Seminary (not used by this implementation) + * @return array Allowed mimetypes */ public function getAllowedMimetypes($seminaryId) { - return $this->db->query( - 'SELECT id, mimetype, size '. - 'FROM questtypes_submit_mimetypes '. - 'WHERE seminary_id = ?', - 'i', - $seminaryId - ); + $mimetypes = array(); + foreach(\nre\configs\AppConfig::$mimetypes['questtypes'] as $mimetype) { + // Strict comparison: mimetypes have to match as exact strings + if(in_array($mimetype['mimetype'], self::mimetypes, true)) { + $mimetypes[] = $mimetype; + } + } + + + return $mimetypes; } @@ -163,6 +184,280 @@ ); } + + /** + * Get similar submissions for a Character submission. 
+ * + * @param int $seminaryId ID of Seminary + * @param int $questId ID of Quest + * @param int $characterId ID of Character + * @param int $submissionId ID of submission + * @return array List of submissions + */ + public function getSimilarSubmissions($seminaryId, $questId, $characterId, $submissionId) + { + // List of submissions with high similarity + $similarSubmissions = array(); + + // Get IDFs + // NOTE(review): IDFs and TFs are loaded even when every similarity + // below is already stored — consider loading them lazily + $idf_N = $this->getIDF_total($seminaryId); + $idf_n = $this->getIDF_docs($seminaryId); + + // Get stored TFs of submission + $tfsA = $this->getTFs($submissionId); + + // Get submissions of same task + $submissions = $this->getSubmissionsForQuest( + $questId, + $characterId, + $submissionId + ); + + // Iterate through submissions of same task + foreach($submissions as &$submission) + { + // Check if similarity has already been calculated + if(is_null($submission['similarity'])) + { + // Get stored TFs of submissions to compare to + $tfsB = $this->getTFs($submission['id']); + + // Calculate similarity + $submission['similarity'] = \hhu\z\lib\Similarity::compare( + $tfsA, + $tfsB, + $idf_N, + $idf_n + ); + + // Save similarity + $this->setSimilarity( + $submissionId, + $submission['id'], + $submission['similarity'] + ); + } + + // Add high similarities to list + if($submission['similarity'] >= self::SIMILARITY_MIN) { + $similarSubmissions[] = $submission; + } + } + + + return $similarSubmissions; + } + + + + + /** + * Index a submission as document. 
+ * + * @param int $submissionId ID of submission + * @param string $filename Full file path of document to read + * @return mixed False (boolean) if the document could not be read, nothing otherwise + */ + private function addDocument($submissionId, $filename) + { + // Read document + $document = \hhu\z\lib\Similarity::readDocument($filename); + if($document === false) { + return false; + } + + // Split document into terms + $terms = \hhu\z\lib\Similarity::splitNgrams($document); + + // Update global values + $this->addTerms($submissionId, $terms); + } + + + /** + * Add terms to the corpus, stored in database. + * + * Every distinct term is inserted into the term table once; every + * occurrence of a term raises its term frequency (tf) for the submission. + * + * @param int $submissionId ID of submission + * @param array $terms List of (non-unique) terms + */ + private function addTerms($submissionId, $terms) + { + // Terms already added to the database, stored as array keys. A loose + // in_array() lookup would treat numeric-looking terms such as “100” + // and “1e2” as equal and skip the insert of the second one. + $uniqueTerms = array(); + foreach($terms as $term) + { + if(!isset($uniqueTerms[$term])) + { + // Add term to database + $this->db->query( + 'INSERT IGNORE INTO questtypes_submit_terms '. + '(term) '. + 'VALUES '. + '(?)', + 's', + $term + ); + $uniqueTerms[$term] = true; + } + + // Link term to submission + $this->db->query( + 'INSERT INTO questtypes_submit_submissions_terms '. + '(submission_id, term_id, tf) '. + 'SELECT ?, questtypes_submit_terms.id, 1 '. + 'FROM questtypes_submit_terms '. + 'WHERE term = ? '. + 'ON DUPLICATE KEY UPDATE '. + 'tf = tf + 1', + 'is', + $submissionId, + $term + ); + } + } + + + /** + * Get all submissions for a Quest including similarity values to the + * given submission, excluding the submissions of the given Character. 
+ * + * @param int $questId ID of Quest + * @param int $characterId ID of Character to exclude submissions of + * @param int $submissionId ID of submission to get similarity values for + * @return array List of submissions + */ + private function getSubmissionsForQuest($questId, $characterId, $submissionId) + { + // The LEFT JOIN keeps submissions that have no stored similarity to + // the given submission; their similarity column is NULL + return $this->db->query( + 'SELECT questtypes_submit_characters.id, questtypes_submit_characters.created, questtypes_submit_characters.quest_id, character_id, upload_id, questtypes_submit_similarities.similarity '. + 'FROM questtypes_submit_characters '. + 'LEFT JOIN questtypes_submit_similarities ON questtypes_submit_similarities.submission_id1 = ? AND questtypes_submit_similarities.submission_id2 = questtypes_submit_characters.id '. + 'WHERE quest_id = ? AND character_id != ?', + 'iii', + $submissionId, + $questId, $characterId + ); + } + + + /** + * Get Term Frequency (TF) values for a submission. + * + * @param int $submissionId ID of submission + * @return array Associative array with term as key and frequency as value + */ + private function getTFs($submissionId) + { + // Read terms + $terms = $this->db->query( + 'SELECT term, tf '. + 'FROM questtypes_submit_submissions_terms '. + 'INNER JOIN questtypes_submit_terms ON questtypes_submit_terms.id = questtypes_submit_submissions_terms.term_id '. + 'WHERE submission_id = ?', + 'i', + $submissionId + ); + + // Convert to TFs + $tfs = array(); + foreach($terms as &$term) { + $tfs[$term['term']] = $term['tf']; + } + + + // Return TFs + return $tfs; + } + + + /** + * Get total count of submissions for a Seminary. + * + * @param int $seminaryId ID of Seminary + * @return int Total count of submissions + */ + private function getIDF_total($seminaryId) + { + $data = $this->db->query( + 'SELECT count(questtypes_submit_characters.id) as c '. + 'FROM charactertypes '. + 'INNER JOIN characters ON characters.charactertype_id = charactertypes.id '. 
+ 'INNER JOIN questtypes_submit_characters ON questtypes_submit_characters.character_id = characters.id '. + 'WHERE charactertypes.seminary_id = ?', + 'i', + $seminaryId + ); + if(!empty($data)) { + return $data[0]['c']; + } + + + return 0; + } + + + /** + * Get count of submissions each term is in for a Seminary. + * + * @param int $seminaryId ID of Seminary + * @return array Associative array with terms as keys and counts as values + */ + private function getIDF_docs($seminaryId) + { + $terms = $this->db->query( + 'SELECT questtypes_submit_terms.term, count(*) AS c '. + 'FROM charactertypes '. + 'INNER JOIN characters ON characters.charactertype_id = charactertypes.id '. + 'INNER JOIN questtypes_submit_characters ON questtypes_submit_characters.character_id = characters.id '. + 'INNER JOIN questtypes_submit_submissions_terms ON questtypes_submit_submissions_terms.submission_id = questtypes_submit_characters.id '. + 'INNER JOIN questtypes_submit_terms ON questtypes_submit_terms.id = questtypes_submit_submissions_terms.term_id '. + 'WHERE charactertypes.seminary_id = ? '. + 'GROUP BY questtypes_submit_terms.term', + 'i', + $seminaryId + ); + + $idfs = array(); + foreach($terms as &$term) { + $idfs[$term['term']] = $term['c']; + } + + + return $idfs; + } + + + /** + * Save the similarity of two submissions. + * + * @param int $submissionId1 ID of submission + * @param int $submissionId2 ID of submission + * @param float $similarity Similarity of both submissions + */ + private function setSimilarity($submissionId1, $submissionId2, $similarity) + { + // The value is stored for both orderings of the pair: first (1, 2), then (2, 1) + $this->db->query( + 'INSERT INTO questtypes_submit_similarities '. + '(submission_id1, submission_id2, similarity) '. + 'VALUES '. + '(?, ?, ?) '. + 'ON DUPLICATE KEY UPDATE '. + 'similarity = ?', + 'iidd', + $submissionId1, $submissionId2, $similarity, + $similarity + ); + $this->db->query( + 'INSERT INTO questtypes_submit_similarities '. + '(submission_id1, submission_id2, similarity) '. + 'VALUES '. + '(?, ?, ?) '. 
+ 'ON DUPLICATE KEY UPDATE '. + 'similarity = ?', + 'iidd', + $submissionId2, $submissionId1, $similarity, + $similarity + ); + } } ?> diff --git a/questtypes/submit/html/submission.tpl b/questtypes/submit/html/submission.tpl index 40618b45..72e6ed77 100644 --- a/questtypes/submit/html/submission.tpl +++ b/questtypes/submit/html/submission.tpl @@ -16,6 +16,24 @@ + +

+ + + + @@ -24,10 +42,17 @@
-
-
- +
+ +
+ +
+ diff --git a/www/css/desktop.css b/www/css/desktop.css index 4704e82b..8ebf13a9 100644 --- a/www/css/desktop.css +++ b/www/css/desktop.css @@ -12,6 +12,7 @@ img{border:0} h1,h2,h3{color:#103a3e} h2{font-size:120%;margin-top:25px} h3{font-size:100%} +h4{margin-bottom:0} ul,ol,nav{padding:0;margin-top:0;list-style-type:none} p{margin:0 0 16px;padding:0} audio,canvas,video{display:inline-block}