merge branch 'submit-similarity'

This commit is contained in:
oliver 2016-04-09 16:36:55 +02:00
commit 1925547cf4
7 changed files with 632 additions and 28 deletions

228
app/lib/Similarity.inc Normal file
View file

@ -0,0 +1,228 @@
<?php
/**
* The Legend of Z
*
* @author Oliver Hanraths <oliver.hanraths@uni-duesseldorf.de>
* @copyright 2014 Heinrich-Heine-Universität Düsseldorf
* @license http://www.gnu.org/licenses/gpl.html
* @link https://bitbucket.org/coderkun/the-legend-of-z
*/
namespace hhu\z\lib;
/**
* Class to calculate similarity between documents.
*
* @author Oliver Hanraths <oliver.hanraths@uni-duesseldorf.de>
*/
class Similarity
{
    /**
     * Read a file and return its text.
     *
     * Currently only PDF-files are supported and “pdftotext” needs to be
     * installed. The lines printed by “pdftotext” are concatenated without
     * a separator and the result is lower-cased. If reading fails, false
     * is returned.
     *
     * @param string $filename Name of file to read
     * @return mixed Text of document (string) or false (boolean)
     */
    public static function readDocument($filename)
    {
        if(!file_exists($filename)) {
            return false;
        }

        $lines = array();
        $status = 0;
        // escapeshellarg() quotes the path, so quotes, "$" or backticks in
        // a file name cannot break out of the command line.
        // NOTE(review): escapeshellarg() is locale-sensitive for non-ASCII
        // bytes; confirm LC_CTYPE is UTF-8 if upload paths can contain them.
        exec(sprintf('pdftotext %s -', escapeshellarg($filename)), $lines, $status);
        if($status != 0) {
            return false;
        }

        return mb_strtolower(implode('', $lines));
    }

    /**
     * Split a text into N-grams.
     *
     * The text is padded with N-1 spaces on both sides, so that the first
     * and the last character each appear in N different n-grams. An empty
     * text yields an empty list. The default N is 3.
     *
     * @throws \InvalidArgumentException If N is smaller than 1
     * @param string $document Text to be split
     * @param int $n Size of grams to split into (N)
     * @return array List of n-grams
     */
    public static function splitNgrams($document, $n=3)
    {
        $n = (int) $n;
        if($n < 1) {
            throw new \InvalidArgumentException('N-gram size must be at least 1');
        }

        $document = (string) $document;
        if($document === '') {
            return array();
        }

        // Exactly N-1 padding characters on each side
        $padding = str_repeat(' ', $n - 1);
        $padded = $padding.$document.$padding;

        // Index of the last position where a full n-gram still fits
        $last = mb_strlen($padded) - $n;

        $ngrams = array();
        for($i=0; $i<=$last; $i++) {
            $ngrams[] = mb_substr($padded, $i, $n);
        }

        return $ngrams;
    }

    /**
     * Compare two documents, represented by their Term Frequency (TF)
     * values.
     *
     * $tfsA, $tfsB and $idf_n are expected to be associative arrays with
     * the term as key and the corresponding frequency as value.
     *
     * @param array $tfsA Term Frequencies of document A
     * @param array $tfsB Term Frequencies of document B
     * @param int $idf_N Total count of documents in corpus
     * @param array $idf_n Count of documents each term occurs in
     * @return float Cosine similarity of both TF*IDF vectors
     */
    public static function compare($tfsA, $tfsB, $idf_N, $idf_n)
    {
        $vectorA = self::getVector($tfsA, $idf_N, $idf_n);
        $vectorB = self::getVector($tfsB, $idf_N, $idf_n);

        return self::cosinus($vectorA, $vectorB);
    }

    /**
     * Calculate the vector for a document based on TF and IDF.
     *
     * $tfs and $idf_n are expected to be associative arrays with the term
     * as key and the corresponding frequency as value. The resulting
     * vector is an associative array with the terms as keys and their
     * TF*IDF weights as values.
     *
     * @param array $tfs Term Frequencies of document
     * @param int $idf_N Total count of documents in corpus
     * @param array $idf_n Count of documents each term occurs in
     * @return array Document vector
     */
    protected static function getVector($tfs, $idf_N, $idf_n)
    {
        return self::getTFIDFs($tfs, $idf_N, $idf_n);
    }

    /**
     * Calculate TF*IDF values for a document.
     *
     * IDF is log2(N / n). A term that is missing in $idf_n is treated as
     * occurring in one document. If N or n is not positive the IDF is
     * defined as 0 instead of dividing by zero or taking log(0).
     *
     * @param array $tfs Term Frequencies of document
     * @param int $idf_N Total count of documents in corpus
     * @param array $idf_n Count of documents each term occurs in
     * @return array TF*IDF values (term as key)
     */
    protected static function getTFIDFs($tfs, $idf_N, $idf_n)
    {
        $tfidfs = array();
        foreach($tfs as $term => $tf)
        {
            // TODO Laplace norm: n = 1?
            $n = array_key_exists($term, $idf_n) ? $idf_n[$term] : 1;

            if($idf_N > 0 && $n > 0) {
                $idf = log($idf_N / $n, 2);
            }
            else {
                $idf = 0.0;
            }

            $tfidfs[$term] = $tf * $idf;
        }

        return $tfidfs;
    }

    /**
     * Calculate cosine similarity between two vectors.
     *
     * sim(a, b) = (a・b) / (||a|| * ||b||)
     *
     * If one of the vectors has a norm of zero, the similarity is 0.0.
     *
     * @param array $a Vector A
     * @param array $b Vector B
     * @return float Similarity value
     */
    protected static function cosinus(array $a, array $b)
    {
        $denominator = self::norm($a) * self::norm($b);
        if($denominator == 0) {
            return 0.0;
        }

        return self::dotProduct($a, $b) / $denominator;
    }

    /**
     * Calculate the dot-product for two vectors.
     *
     * a・b = summation{i=1,n}(a[i] * b[i])
     *
     * Only keys with a non-empty value in both vectors contribute, so it
     * is sufficient to walk over one vector and look up the other.
     *
     * @param array $a Vector A
     * @param array $b Vector B
     * @return float Dot-product
     */
    protected static function dotProduct(array $a, array $b)
    {
        $dotProduct = 0.0;
        foreach($a as $key => $value) {
            if(!empty($value) && !empty($b[$key])) {
                $dotProduct += ($value * $b[$key]);
            }
        }

        return $dotProduct;
    }

    /**
     * Calculate the Euclidean norm for a vector.
     *
     * ||x|| = sqrt(x・x) // ・ is a dot product
     *
     * @param array $vector Vector
     * @return float Euclidean norm
     */
    protected static function norm(array $vector)
    {
        return sqrt(self::dotProduct($vector, $vector));
    }
}
?>

View file

@ -153,6 +153,10 @@
array(
'mimetype' => 'image/png',
'size' => 1048576
),
array(
'mimetype' => 'application/pdf',
'size' => 1048576
)
),
'map' => array(

View file

@ -1866,28 +1866,58 @@ CREATE TABLE `questtypes_submit_characters_comments` (
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `questtypes_submit_mimetypes`
-- Table structure for table `questtypes_submit_similarities`
--
DROP TABLE IF EXISTS `questtypes_submit_mimetypes`;
DROP TABLE IF EXISTS `questtypes_submit_similarities`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
CREATE TABLE `questtypes_submit_mimetypes` (
`id` int(11) NOT NULL AUTO_INCREMENT,
CREATE TABLE `questtypes_submit_similarities` (
`submission_id1` int(11) NOT NULL,
`submission_id2` int(11) NOT NULL,
`created` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
`created_user_id` int(11) NOT NULL,
`seminary_id` int(11) NOT NULL,
`mimetype` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
`size` int(10) unsigned NOT NULL DEFAULT '0',
PRIMARY KEY (`id`),
UNIQUE KEY `mimetype` (`mimetype`,`seminary_id`),
KEY `created_user_id` (`created_user_id`),
KEY `seminary_id` (`seminary_id`),
CONSTRAINT `questtypes_submit_mimetypes_ibfk_1` FOREIGN KEY (`created_user_id`) REFERENCES `users` (`id`),
CONSTRAINT `questtypes_submit_mimetypes_ibfk_2` FOREIGN KEY (`seminary_id`) REFERENCES `seminaries` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
`similarity` decimal(10,9) NOT NULL,
PRIMARY KEY (`submission_id1`,`submission_id2`),
KEY `submission_id2` (`submission_id2`),
CONSTRAINT `questtypes_submit_similarities_ibfk_1` FOREIGN KEY (`submission_id1`) REFERENCES `questtypes_submit_characters` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
CONSTRAINT `questtypes_submit_similarities_ibfk_2` FOREIGN KEY (`submission_id2`) REFERENCES `questtypes_submit_characters` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `questtypes_submit_submissions_terms`
--
DROP TABLE IF EXISTS `questtypes_submit_submissions_terms`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
CREATE TABLE `questtypes_submit_submissions_terms` (
`submission_id` int(11) NOT NULL,
`term_id` int(11) NOT NULL,
`created` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
`tf` mediumint(8) unsigned NOT NULL DEFAULT '1',
PRIMARY KEY (`submission_id`,`term_id`),
KEY `term_id` (`term_id`),
CONSTRAINT `questtypes_submit_submissions_terms_ibfk_1` FOREIGN KEY (`submission_id`) REFERENCES `questtypes_submit_characters` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
CONSTRAINT `questtypes_submit_submissions_terms_ibfk_2` FOREIGN KEY (`term_id`) REFERENCES `questtypes_submit_terms` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `questtypes_submit_terms`
--
DROP TABLE IF EXISTS `questtypes_submit_terms`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
CREATE TABLE `questtypes_submit_terms` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`term` varchar(9) COLLATE utf8mb4_unicode_ci NOT NULL,
PRIMARY KEY (`id`),
UNIQUE KEY `term` (`term`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='Terms/N-grams';
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `questtypes_textinput`
--
@ -2663,4 +2693,4 @@ DELIMITER ;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
-- Dump completed on 2016-03-26 19:13:31
-- Dump completed on 2016-04-09 13:18:45

View file

@ -24,7 +24,7 @@
*
* @var array
*/
public $models = array('quests', 'uploads', 'users');
public $models = array('quests', 'uploads', 'users', 'characters', 'questgroups');
@ -210,6 +210,27 @@
catch(\nre\exceptions\IdNotFoundException $e) {
}
}
$submission['similar'] = $this->Submit->getSimilarSubmissions(
$seminary['id'],
$quest['id'],
$character['id'],
$submission['id']
);
foreach($submission['similar'] as &$similarSubmission)
{
$similarSubmission['quest'] = $this->Quests->getQuestById(
$similarSubmission['quest_id']
);
$similarSubmission['questgroup'] = $this->Questgroups->getQuestgroupById(
$similarSubmission['quest']['questgroup_id']
);
$similarSubmission['character'] = $this->Characters->getCharacterById(
$similarSubmission['character_id']
);
$similarSubmission['upload'] = $this->Uploads->getSeminaryuploadById(
$similarSubmission['upload_id']
);
}
}
// Status

View file

@ -19,6 +19,19 @@
*/
class SubmitQuesttypeModel extends \hhu\z\models\QuesttypeModel
{
/**
* Minimum similarity value for two submissions
*
* @var float
*/
const SIMILARITY_MIN = 0.8;
/**
* Supported mimetypes
*
* @var array
*/
const mimetypes = array('application/pdf');
/**
* Required models
*
@ -81,6 +94,12 @@
$questId, $characterId, $uploadId
);
// Index submission for similarity calculation
$this->addDocument(
$this->db->getInsertId(),
ROOT.DS.\nre\configs\AppConfig::$dirs['seminaryuploads'].DS.$filename
);
return true;
}
@ -131,18 +150,20 @@
/**
* Get allowed mimetypes for uploading a file.
*
* @param int $seminaryId ID of Seminary
* @return array Allowed mimetypes
* @param int $seminaryId ID of Seminary
* @return array Allowed mimetypes
*/
public function getAllowedMimetypes($seminaryId)
{
return $this->db->query(
'SELECT id, mimetype, size '.
'FROM questtypes_submit_mimetypes '.
'WHERE seminary_id = ?',
'i',
$seminaryId
);
$mimetypes = array();
foreach(\nre\configs\AppConfig::$mimetypes['questtypes'] as $mimetype) {
if(in_array($mimetype['mimetype'], self::mimetypes)) {
$mimetypes[] = $mimetype;
}
}
return $mimetypes;
}
@ -163,6 +184,280 @@
);
}
/**
* Get similar submissions for a Character submission.
*
* @param int $seminaryId ID of Seminary
* @param int $questId ID of Quest
* @param int $characterId ID of Character
* @param int $submissionId ID of submission
* @return array List of submissions
*/
public function getSimilarSubmissions($seminaryId, $questId, $characterId, $submissionId)
{
    // Candidates: submissions of other Characters for the same Quest,
    // each with its stored similarity to $submissionId (NULL when it has
    // not been calculated yet)
    $submissions = $this->getSubmissionsForQuest(
        $questId,
        $characterId,
        $submissionId
    );

    // Corpus statistics and the TFs of the given submission are only
    // needed when a similarity is missing, so they are loaded lazily
    $idf_N = null;
    $idf_n = null;
    $tfsA = null;

    // List of submissions with high similarity
    $similarSubmissions = array();
    foreach($submissions as $submission)
    {
        // Check if similarity has already been calculated
        if(is_null($submission['similarity']))
        {
            if(is_null($tfsA)) {
                $idf_N = $this->getIDF_total($seminaryId);
                $idf_n = $this->getIDF_docs($seminaryId);
                $tfsA = $this->getTFs($submissionId);
            }

            // Get stored TFs of submission to compare to
            $tfsB = $this->getTFs($submission['id']);

            // Calculate and save similarity
            $submission['similarity'] = \hhu\z\lib\Similarity::compare(
                $tfsA,
                $tfsB,
                $idf_N,
                $idf_n
            );
            $this->setSimilarity(
                $submissionId,
                $submission['id'],
                $submission['similarity']
            );
        }

        // Add high similarities to list
        if($submission['similarity'] >= self::SIMILARITY_MIN) {
            $similarSubmissions[] = $submission;
        }
    }

    return $similarSubmissions;
}
/**
* Index a submission as document.
*
* @param int $submissionId ID of submission
* @param string $filename Full file path of document to read
*/
private function addDocument($submissionId, $filename)
{
    // Read document; false means the file is missing or could not be
    // converted to text
    $document = \hhu\z\lib\Similarity::readDocument($filename);
    if($document === false) {
        return false;
    }

    // Split document into terms (n-grams with the default size)
    $terms = \hhu\z\lib\Similarity::splitNgrams($document);

    // Store terms and their frequencies for this submission
    $this->addTerms($submissionId, $terms);

    // Report success explicitly, mirroring the false returned on failure
    return true;
}
/**
* Add terms to the corpus, stored in database.
*
* @param int $submissionId ID of submission
* @param array $terms List of (non-unique) terms
*/
private function addTerms($submissionId, $terms)
{
    // Count the occurrences of every distinct term up front. This keeps
    // terms apart by exact string value (a loose comparison would treat
    // numeric-looking terms such as "1.0" and "01." as equal) and needs
    // two queries per distinct term instead of one per occurrence.
    $frequencies = array_count_values($terms);

    foreach($frequencies as $term => $tf)
    {
        // PHP turns integer-like array keys into integers
        $term = (string) $term;

        // Add term to database (no-op if it already exists)
        $this->db->query(
            'INSERT IGNORE INTO questtypes_submit_terms '.
            '(term) '.
            'VALUES '.
            '(?)',
            's',
            $term
        );

        // Link term to submission; if a link already exists, the
        // frequency is added to the stored one
        $this->db->query(
            'INSERT INTO questtypes_submit_submissions_terms '.
            '(submission_id, term_id, tf) '.
            'SELECT ?, questtypes_submit_terms.id, ? '.
            'FROM questtypes_submit_terms '.
            'WHERE term = ? '.
            'ON DUPLICATE KEY UPDATE '.
            'tf = tf + ?',
            'iisi',
            $submissionId,
            $tf,
            $term,
            $tf
        );
    }
}
/**
* Get all submissions for a Quest including similarity values to the
* given submission, excluding the submissions of the given Character.
*
* @param int $questId ID of Quest
* @param int $characterId ID of Character to exclude submissions of
* @param int $submissionId ID of submission to get similarity values for
* @return array List of submissions
*/
private function getSubmissionsForQuest($questId, $characterId, $submissionId)
{
    // The LEFT JOIN keeps submissions without a stored similarity; their
    // "similarity" column is NULL in that case
    $sql =
        'SELECT questtypes_submit_characters.id, questtypes_submit_characters.created, questtypes_submit_characters.quest_id, character_id, upload_id, questtypes_submit_similarities.similarity '.
        'FROM questtypes_submit_characters '.
        'LEFT JOIN questtypes_submit_similarities ON questtypes_submit_similarities.submission_id1 = ? AND questtypes_submit_similarities.submission_id2 = questtypes_submit_characters.id '.
        'WHERE quest_id = ? AND character_id != ?';

    $submissions = $this->db->query(
        $sql,
        'iii',
        $submissionId,
        $questId,
        $characterId
    );

    return $submissions;
}
/**
* Get Term Frequency (TF) values for a submission.
*
* @param int $submissionId ID of submission
* @return array Associative array with term as key and frequency as value
*/
private function getTFs($submissionId)
{
    // Load all (term, tf) pairs linked to the submission
    $sql =
        'SELECT term, tf '.
        'FROM questtypes_submit_submissions_terms '.
        'INNER JOIN questtypes_submit_terms ON questtypes_submit_terms.id = questtypes_submit_submissions_terms.term_id '.
        'WHERE submission_id = ?';
    $rows = $this->db->query($sql, 'i', $submissionId);

    // Re-key the rows: term => frequency
    $frequencies = array();
    foreach($rows as $row) {
        $frequencies[$row['term']] = $row['tf'];
    }

    return $frequencies;
}
/**
* Get total count of submissions for a Seminary.
*
* @param int $seminaryId ID of Seminary
* @return int Total count of submissions
*/
private function getIDF_total($seminaryId)
{
    // Submissions are tied to a Seminary via Character -> Charactertype
    $sql =
        'SELECT count(questtypes_submit_characters.id) as c '.
        'FROM charactertypes '.
        'INNER JOIN characters ON characters.charactertype_id = charactertypes.id '.
        'INNER JOIN questtypes_submit_characters ON questtypes_submit_characters.character_id = characters.id '.
        'WHERE charactertypes.seminary_id = ?';
    $rows = $this->db->query($sql, 'i', $seminaryId);

    // No result row: report an empty corpus
    if(empty($rows)) {
        return 0;
    }

    return $rows[0]['c'];
}
/**
* Get count of submissions each term is in for a Seminary.
*
* @param int $seminaryId ID of Seminary
* @return array Associative array with terms as keys and counts as values
*/
private function getIDF_docs($seminaryId)
{
    // One result row per term, with the number of submission/term links
    // found for the Seminary
    $sql =
        'SELECT questtypes_submit_terms.term, count(*) AS c '.
        'FROM charactertypes '.
        'INNER JOIN characters ON characters.charactertype_id = charactertypes.id '.
        'INNER JOIN questtypes_submit_characters ON questtypes_submit_characters.character_id = characters.id '.
        'INNER JOIN questtypes_submit_submissions_terms ON questtypes_submit_submissions_terms.submission_id = questtypes_submit_characters.id '.
        'INNER JOIN questtypes_submit_terms ON questtypes_submit_terms.id = questtypes_submit_submissions_terms.term_id '.
        'WHERE charactertypes.seminary_id = ? '.
        'GROUP BY questtypes_submit_terms.term';
    $rows = $this->db->query($sql, 'i', $seminaryId);

    // Re-key the rows: term => count
    $counts = array();
    foreach($rows as $row) {
        $counts[$row['term']] = $row['c'];
    }

    return $counts;
}
/**
* Save the similarity of two submissions.
*
* @param int $submissionId1 ID of submission
* @param int $submissionId2 ID of submission
* @param float $similarity Similarity of both submissions
*/
private function setSimilarity($submissionId1, $submissionId2, $similarity)
{
    // Upsert statement, used for both directions of the pair
    $sql =
        'INSERT INTO questtypes_submit_similarities '.
        '(submission_id1, submission_id2, similarity) '.
        'VALUES '.
        '(?, ?, ?) '.
        'ON DUPLICATE KEY UPDATE '.
        'similarity = ?';

    // The value is stored as (1, 2) and as (2, 1), so it can be looked
    // up starting from either submission
    $pairs = array(
        array($submissionId1, $submissionId2),
        array($submissionId2, $submissionId1)
    );
    foreach($pairs as $pair) {
        $this->db->query(
            $sql,
            'iidd',
            $pair[0], $pair[1], $similarity,
            $similarity
        );
    }
}
}
?>

View file

@ -16,6 +16,24 @@
<?php endforeach ?>
</ol>
<?php endif ?>
<?php // List of submissions whose similarity to this one is high; rendered only when there are any ?>
<?php if(!empty($submission['similar'])) : ?>
<h4><?=_('Similar submissions')?></h4>
<?php foreach($submission['similar'] as $similar) : ?>
<ul>
<li>
<p><small><?=_('Similarity')?>: <?=$numberFormatter->format($similar['similarity'])?></small></p>
<p><a href="<?=$linker->link(array('uploads','seminary',$seminary['url'], $similar['upload']['url']))?>"><?=$similar['upload']['name']?></a></p>
<p><small>
<a href="<?=$linker->link(array('quests','submission',$seminary['url'],$similar['questgroup']['url'],$similar['quest']['url'],$similar['character']['url']))?>">
<?=$similar['character']['name']?>,
<?=$similar['quest']['title']?>
</a>,
<?=$dateFormatter->format(new \DateTime($similar['created']))?> <?=$timeFormatter->format(new \DateTime($similar['created']))?>
</small></p>
</li>
</ul>
<?php endforeach ?>
<?php endif ?>
</li>
<?php endforeach ?>
</ol>
@ -24,10 +42,17 @@
<form method="post" class="logreg">
<?php $submission = array_pop($submissions); ?>
<?php if(!$solved) : ?>
<?=_('Comment')?><br />
<textarea name="characterdata[comment]"></textarea><br />
<input type="hidden" name="characterdata[submission_id]" value="<?=$submission['id']?>" />
<fieldset>
<legend><?=_('Comment')?></legend>
<textarea id="characterdata-comment" name="characterdata[comment]"></textarea><br />
<input type="hidden" name="characterdata[submission_id]" value="<?=$submission['id']?>" />
</fieldset>
<input type="submit" name="submit" value="<?=_('solved')?>" />
<input type="submit" name="submit" value="<?=_('unsolved')?>" />
<?php endif ?>
</form>
<script>
$(function() {
$("#characterdata-comment").markItUp(mySettings);
});
</script>

View file

@ -12,6 +12,7 @@ img{border:0}
h1,h2,h3{color:#103a3e}
h2{font-size:120%;margin-top:25px}
h3{font-size:100%}
h4{margin-bottom:0}
ul,ol,nav{padding:0;margin-top:0;list-style-type:none}
p{margin:0 0 16px;padding:0}
audio,canvas,video{display:inline-block}