implement similarity algorithm for questtype “Submit”
This commit is contained in:
parent
67f92d6174
commit
53fda5caaf
6 changed files with 533 additions and 5 deletions
183
app/lib/Similarity.inc
Normal file
183
app/lib/Similarity.inc
Normal file
|
@ -0,0 +1,183 @@
|
|||
<?php
|
||||
|
||||
/**
|
||||
* The Legend of Z
|
||||
*
|
||||
* @author Oliver Hanraths <oliver.hanraths@uni-duesseldorf.de>
|
||||
* @copyright 2014 Heinrich-Heine-Universität Düsseldorf
|
||||
* @license http://www.gnu.org/licenses/gpl.html
|
||||
* @link https://bitbucket.org/coderkun/the-legend-of-z
|
||||
*/
|
||||
|
||||
namespace hhu\z\lib;
|
||||
|
||||
|
||||
/**
|
||||
* Class to calculate similarity between documents.
|
||||
*
|
||||
* @author Oliver Hanraths <oliver.hanraths@uni-duesseldorf.de>
|
||||
*/
|
||||
class Similarity
{

	/**
	 * Extract the text of a PDF document and convert it to lowercase.
	 *
	 * Runs the external `pdftotext` tool on the file and joins its output
	 * lines without any separator before lowercasing the result.
	 *
	 * @param	string	$filename	Path of the PDF file to read
	 * @return	string|false		Lowercased text, or false if the file does not exist or pdftotext exits with a non-zero status
	 */
	public static function readDocument($filename)
	{
		if(!file_exists($filename)) {
			return false;
		}

		$text = array();
		$result = 0;
		// escapeshellarg() prevents quotes, "$" or backticks in the filename
		// from being interpreted by the shell.
		// NOTE(review): escapeshellarg() is locale-aware; confirm LC_CTYPE is
		// a UTF-8 locale if filenames may contain non-ASCII characters.
		exec(sprintf('pdftotext %s -', escapeshellarg($filename)), $text, $result);
		if($result != 0) {
			return false;
		}

		return mb_strtolower(implode('', $text));
	}


	/**
	 * Split a document into overlapping character n-grams.
	 *
	 * The document is padded with n-1 spaces on both sides so that its first
	 * and last characters appear in every position of an n-gram.
	 *
	 * @param	string	$document	Text to split
	 * @param	int	$n		Length of each n-gram (defaults to 3)
	 * @return	array			List of n-grams in document order
	 */
	public static function splitNgrams($document, $n=3)
	{
		$n = max(1, (int) $n);
		$affix = str_repeat(' ', $n-1);
		$padded = $affix.$document.$affix;

		// Index of the last position where a full n-gram still fits;
		// computed once because mb_strlen() is not free on multibyte text
		$last = mb_strlen($padded) - $n;

		$ngrams = array();
		for($i=0; $i<=$last; $i++) {
			$ngrams[] = mb_substr($padded, $i, $n);
		}


		return $ngrams;
	}


	/**
	 * Calculate the similarity of two documents as the cosine of their
	 * TF*IDF vectors.
	 *
	 * @param	array	$tfsA	Term frequencies of document A (term => count)
	 * @param	array	$tfsB	Term frequencies of document B (term => count)
	 * @param	int	$idf_N	Total number of documents
	 * @param	array	$idf_n	Number of documents each term occurs in (term => count)
	 * @return	float|int	Cosine similarity, 0 if one of the vectors has no length
	 */
	public static function compare($tfsA, $tfsB, $idf_N, $idf_n)
	{
		// Create vectors
		$vectorA = self::getVector($tfsA, $idf_N, $idf_n);
		$vectorB = self::getVector($tfsB, $idf_N, $idf_n);


		// Compare vectors
		return self::cosinus($vectorA, $vectorB);
	}




	/**
	 * Build the weighted term vector of a document.
	 *
	 * Currently the vector consists of the plain TF*IDF values.
	 *
	 * @param	array	$tfs	Term frequencies of the document (term => count)
	 * @param	int	$idf_N	Total number of documents
	 * @param	array	$idf_n	Number of documents each term occurs in (term => count)
	 * @return	array		Vector as map term => weight
	 */
	protected static function getVector($tfs, $idf_N, $idf_n)
	{
		return self::getTFIDFs($tfs, $idf_N, $idf_n);
	}


	/**
	 * Calculate TF*IDF for each term of a document, with
	 * IDF = log2(N / n_term).
	 *
	 * @param	array	$tfs	Term frequencies of the document (term => count)
	 * @param	int	$idf_N	Total number of documents
	 * @param	array	$idf_n	Number of documents each term occurs in (term => count)
	 * @return	array		Map term => TF*IDF
	 */
	protected static function getTFIDFs($tfs, $idf_N, $idf_n)
	{
		$tfidfs = array();

		foreach($tfs as $term => $tf)
		{
			// Terms without a known document count are treated as if they
			// occurred in exactly one document
			// TODO Laplace norm: n = 1?
			$n = array_key_exists($term, $idf_n) ? $idf_n[$term] : 1;
			$tfidfs[$term] = $tf * log($idf_N / $n, 2);
		}


		return $tfidfs;
	}


	/**
	 * Cosine similarity of two sparse vectors.
	 * sim(a, b) = (a・b) / (||a|| * ||b||)
	 *
	 * @param	array	$a	Vector A (key => value)
	 * @param	array	$b	Vector B (key => value)
	 * @return	float|int	Similarity, 0 if the product of the norms is 0
	 */
	protected static function cosinus(array $a, array $b)
	{
		$norms = self::norm($a) * self::norm($b);
		if($norms != 0) {
			return self::dotProduct($a, $b) / $norms;
		}

		return 0;
	}


	/**
	 * Dot product of two sparse vectors.
	 * a・b = summation{i=1,n}(a[i] * b[i])
	 *
	 * Only keys with a non-empty value in both vectors contribute.
	 *
	 * @param	array	$a	Vector A (key => value)
	 * @param	array	$b	Vector B (key => value)
	 * @return	float|int	Dot product
	 */
	protected static function dotProduct(array $a, array $b)
	{
		$dotProduct = 0;
		// A key can only contribute if it is present in $a, so walking $a
		// once is sufficient
		foreach($a as $key => $value)
		{
			if(!empty($value) && !empty($b[$key]))
			{
				$dotProduct += ($value * $b[$key]);
			}
		}

		return $dotProduct;
	}


	/**
	 * Euclidean norm of a sparse vector.
	 * ||x|| = sqrt(x・x) // ・ is a dot product
	 *
	 * @param	array	$vector	Vector (key => value)
	 * @return	float		Length of the vector
	 */
	protected static function norm(array $vector)
	{
		return sqrt(self::dotProduct($vector, $vector));
	}

}
|
||||
|
||||
?>
|
|
@ -1888,6 +1888,59 @@ CREATE TABLE `questtypes_submit_mimetypes` (
|
|||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||
|
||||
--
|
||||
-- Table structure for table `questtypes_submit_similarities`
|
||||
--
|
||||
|
||||
DROP TABLE IF EXISTS `questtypes_submit_similarities`;
|
||||
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
||||
/*!40101 SET character_set_client = utf8 */;
|
||||
CREATE TABLE `questtypes_submit_similarities` (
|
||||
`submission_id1` int(11) NOT NULL,
|
||||
`submission_id2` int(11) NOT NULL,
|
||||
`created` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
`similarity` decimal(10,9) NOT NULL,
|
||||
PRIMARY KEY (`submission_id1`,`submission_id2`),
|
||||
KEY `submission_id2` (`submission_id2`),
|
||||
CONSTRAINT `questtypes_submit_similarities_ibfk_1` FOREIGN KEY (`submission_id1`) REFERENCES `questtypes_submit_characters` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
|
||||
CONSTRAINT `questtypes_submit_similarities_ibfk_2` FOREIGN KEY (`submission_id2`) REFERENCES `questtypes_submit_characters` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||
|
||||
--
|
||||
-- Table structure for table `questtypes_submit_submissions_terms`
|
||||
--
|
||||
|
||||
DROP TABLE IF EXISTS `questtypes_submit_submissions_terms`;
|
||||
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
||||
/*!40101 SET character_set_client = utf8 */;
|
||||
CREATE TABLE `questtypes_submit_submissions_terms` (
|
||||
`submission_id` int(11) NOT NULL,
|
||||
`term_id` int(11) NOT NULL,
|
||||
`created` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
`tf` mediumint(8) unsigned NOT NULL DEFAULT '1',
|
||||
PRIMARY KEY (`submission_id`,`term_id`),
|
||||
KEY `term_id` (`term_id`),
|
||||
CONSTRAINT `questtypes_submit_submissions_terms_ibfk_1` FOREIGN KEY (`submission_id`) REFERENCES `questtypes_submit_characters` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
|
||||
CONSTRAINT `questtypes_submit_submissions_terms_ibfk_2` FOREIGN KEY (`term_id`) REFERENCES `questtypes_submit_terms` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||
|
||||
--
|
||||
-- Table structure for table `questtypes_submit_terms`
|
||||
--
|
||||
|
||||
DROP TABLE IF EXISTS `questtypes_submit_terms`;
|
||||
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
||||
/*!40101 SET character_set_client = utf8 */;
|
||||
CREATE TABLE `questtypes_submit_terms` (
|
||||
`id` int(11) NOT NULL AUTO_INCREMENT,
|
||||
`term` varchar(9) COLLATE utf8mb4_unicode_ci NOT NULL,
|
||||
PRIMARY KEY (`id`),
|
||||
UNIQUE KEY `term` (`term`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='Terms/N-grams';
|
||||
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||
|
||||
--
|
||||
-- Table structure for table `questtypes_textinput`
|
||||
--
|
||||
|
@ -2663,4 +2716,4 @@ DELIMITER ;
|
|||
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
|
||||
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
|
||||
|
||||
-- Dump completed on 2016-03-26 19:13:31
|
||||
-- Dump completed on 2016-04-09 13:18:45
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
*
|
||||
* @var array
|
||||
*/
|
||||
public $models = array('quests', 'uploads', 'users');
|
||||
public $models = array('quests', 'uploads', 'users', 'characters', 'questgroups');
|
||||
|
||||
|
||||
|
||||
|
@ -210,6 +210,27 @@
|
|||
catch(\nre\exceptions\IdNotFoundException $e) {
|
||||
}
|
||||
}
|
||||
$submission['similar'] = $this->Submit->getSimilarSubmissions(
|
||||
$seminary['id'],
|
||||
$quest['id'],
|
||||
$character['id'],
|
||||
$submission['id']
|
||||
);
|
||||
foreach($submission['similar'] as &$similarSubmission)
|
||||
{
|
||||
$similarSubmission['quest'] = $this->Quests->getQuestById(
|
||||
$similarSubmission['quest_id']
|
||||
);
|
||||
$similarSubmission['questgroup'] = $this->Questgroups->getQuestgroupById(
|
||||
$similarSubmission['quest']['questgroup_id']
|
||||
);
|
||||
$similarSubmission['character'] = $this->Characters->getCharacterById(
|
||||
$similarSubmission['character_id']
|
||||
);
|
||||
$similarSubmission['upload'] = $this->Uploads->getSeminaryuploadById(
|
||||
$similarSubmission['upload_id']
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Status
|
||||
|
|
|
@ -81,6 +81,12 @@
|
|||
$questId, $characterId, $uploadId
|
||||
);
|
||||
|
||||
// Index submission for similarity calculation
|
||||
$this->addDocument(
|
||||
$this->db->getInsertId(),
|
||||
ROOT.DS.\nre\configs\AppConfig::$dirs['seminaryuploads'].DS.$filename
|
||||
);
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -163,6 +169,245 @@
|
|||
);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Get the submissions of other Characters for the same Quest that are
 * highly similar (similarity >= 0.7) to the given submission.
 *
 * Similarities that are not stored yet are calculated from the stored
 * term frequencies and saved.
 *
 * @param	int	$seminaryId	ID of the Seminary used for the IDF statistics
 * @param	int	$questId	ID of the Quest whose submissions are compared
 * @param	int	$characterId	ID of the Character whose own submissions are left out
 * @param	int	$submissionId	ID of the submission to compare the others to
 * @return	array			Submissions with a similarity of at least 0.7
 */
public function getSimilarSubmissions($seminaryId, $questId, $characterId, $submissionId)
{
	// Document statistics and term frequencies of the reference submission
	$totalDocuments = $this->getIDF_total($seminaryId);
	$documentCounts = $this->getIDF_docs($seminaryId);
	$referenceTfs = $this->getTFs($submissionId);

	// Walk through the submissions of the same task
	$matches = array();
	$candidates = $this->getSubmissionsForQuest($questId, $characterId, $submissionId);
	foreach($candidates as $candidate)
	{
		// No stored value yet: calculate the similarity and save it
		if($candidate['similarity'] === null)
		{
			$candidateTfs = $this->getTFs($candidate['id']);
			$candidate['similarity'] = \hhu\z\lib\Similarity::compare(
				$referenceTfs,
				$candidateTfs,
				$totalDocuments,
				$documentCounts
			);
			$this->setSimilarity($submissionId, $candidate['id'], $candidate['similarity']);
		}

		// Collect high similarities
		if($candidate['similarity'] >= 0.7) {
			$matches[] = $candidate;
		}
	}


	return $matches;
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
 * Index the document of a submission for the similarity calculation.
 *
 * Reads the text of the file, splits it into n-grams and stores them as
 * terms of the submission.
 *
 * @param	int	$submissionId	ID of the submission the document belongs to
 * @param	string	$filename	Path of the uploaded file
 * @return	bool			True if the document was indexed, false if it could not be read
 */
private function addDocument($submissionId, $filename)
{
	// Read document
	$document = \hhu\z\lib\Similarity::readDocument($filename);
	if($document === false) {
		return false;
	}

	// Split document into terms
	$terms = \hhu\z\lib\Similarity::splitNgrams($document);

	// Update global values
	$this->addTerms($submissionId, $terms);


	// Report success explicitly so both outcomes return a boolean
	return true;
}
|
||||
|
||||
|
||||
/**
 * Store the terms of a submission together with their term frequencies.
 *
 * Every distinct term is added to the table of known terms (if not yet
 * present) and linked to the submission; the stored frequency is increased
 * by the number of occurrences of the term in the given list.
 *
 * @param	int	$submissionId	ID of the submission the terms belong to
 * @param	array	$terms		List of terms (n-grams) in document order, duplicates included
 */
private function addTerms($submissionId, $terms)
{
	// Count the occurrences of each distinct term. Terms are compared as
	// exact array keys here; a loose comparison would treat different
	// numeric-looking terms (e.g. "1e2" and "100") as the same term.
	$frequencies = array_count_values($terms);

	foreach($frequencies as $term => $tf)
	{
		// PHP turns integer-like string keys into integers
		$term = (string) $term;

		// Add term to database
		$this->db->query(
			'INSERT IGNORE INTO questtypes_submit_terms '.
			'(term) '.
			'VALUES '.
			'(?)',
			's',
			$term
		);

		// Link term to submission, adding all occurrences at once
		$this->db->query(
			'INSERT INTO questtypes_submit_submissions_terms '.
			'(submission_id, term_id, tf) '.
			'SELECT ?, questtypes_submit_terms.id, ? '.
			'FROM questtypes_submit_terms '.
			'WHERE term = ? '.
			'ON DUPLICATE KEY UPDATE '.
			'tf = tf + ?',
			'iisi',
			$submissionId,
			$tf,
			$term,
			$tf
		);
	}
}
|
||||
|
||||
|
||||
/**
 * Get the submissions of all other Characters for a Quest, each with the
 * stored similarity to the given submission (NULL if none is stored).
 *
 * @param	int	$questId	ID of the Quest
 * @param	int	$characterId	ID of the Character whose submissions are left out
 * @param	int	$submissionId	ID of the submission the similarities refer to
 * @return	array			Submissions with id, created, quest_id, character_id, upload_id and similarity
 */
private function getSubmissionsForQuest($questId, $characterId, $submissionId)
{
	$sql =
		'SELECT questtypes_submit_characters.id, questtypes_submit_characters.created, questtypes_submit_characters.quest_id, character_id, upload_id, questtypes_submit_similarities.similarity '.
		'FROM questtypes_submit_characters '.
		'LEFT JOIN questtypes_submit_similarities ON questtypes_submit_similarities.submission_id1 = ? AND questtypes_submit_similarities.submission_id2 = questtypes_submit_characters.id '.
		'WHERE quest_id = ? AND character_id != ?';

	// Parameter order follows the placeholders: join first, then filter
	$submissions = $this->db->query($sql, 'iii', $submissionId, $questId, $characterId);


	return $submissions;
}
|
||||
|
||||
|
||||
/**
 * Get the stored term frequencies of a submission.
 *
 * @param	int	$submissionId	ID of the submission
 * @return	array			Map term => term frequency
 */
private function getTFs($submissionId)
{
	$sql =
		'SELECT term, tf '.
		'FROM questtypes_submit_submissions_terms '.
		'INNER JOIN questtypes_submit_terms ON questtypes_submit_terms.id = questtypes_submit_submissions_terms.term_id '.
		'WHERE submission_id = ?';
	$rows = $this->db->query($sql, 'i', $submissionId);

	// Re-key the rows by term
	$frequencies = array();
	foreach($rows as $row) {
		$frequencies[$row['term']] = $row['tf'];
	}


	return $frequencies;
}
|
||||
|
||||
|
||||
/**
 * Get the total count of submissions of a Seminary (the "N" of the IDF).
 *
 * Submissions are assigned to a Seminary via the Charactertype of the
 * submitting Character.
 *
 * @param	int	$seminaryId	ID of the Seminary
 * @return	int			Count of submissions, 0 if the query yields no result
 */
private function getIDF_total($seminaryId)
{
	$sql =
		'SELECT count(questtypes_submit_characters.id) as c '.
		'FROM charactertypes '.
		'INNER JOIN characters ON characters.charactertype_id = charactertypes.id '.
		'INNER JOIN questtypes_submit_characters ON questtypes_submit_characters.character_id = characters.id '.
		'WHERE charactertypes.seminary_id = ?';
	$rows = $this->db->query($sql, 'i', $seminaryId);

	if(empty($rows)) {
		return 0;
	}


	return $rows[0]['c'];
}
|
||||
|
||||
|
||||
/**
 * Get the count of submissions each term occurs in for a Seminary (the
 * "n" of the IDF).
 *
 * @param	int	$seminaryId	ID of the Seminary
 * @return	array			Map term => count of submissions containing the term
 */
private function getIDF_docs($seminaryId)
{
	$sql =
		'SELECT questtypes_submit_terms.term, count(*) AS c '.
		'FROM charactertypes '.
		'INNER JOIN characters ON characters.charactertype_id = charactertypes.id '.
		'INNER JOIN questtypes_submit_characters ON questtypes_submit_characters.character_id = characters.id '.
		'INNER JOIN questtypes_submit_submissions_terms ON questtypes_submit_submissions_terms.submission_id = questtypes_submit_characters.id '.
		'INNER JOIN questtypes_submit_terms ON questtypes_submit_terms.id = questtypes_submit_submissions_terms.term_id '.
		'WHERE charactertypes.seminary_id = ? '.
		'GROUP BY questtypes_submit_terms.term';
	$rows = $this->db->query($sql, 'i', $seminaryId);

	// Re-key the rows by term
	$documentCounts = array();
	foreach($rows as $row) {
		$documentCounts[$row['term']] = $row['c'];
	}


	return $documentCounts;
}
|
||||
|
||||
|
||||
/**
 * Save the similarity of two submissions.
 *
 * The value is stored for both orders of the pair, so it can be looked up
 * starting from either submission. Existing values are overwritten.
 *
 * @param	int	$submissionId1	ID of the first submission
 * @param	int	$submissionId2	ID of the second submission
 * @param	float	$similarity	Similarity of the two submissions
 */
private function setSimilarity($submissionId1, $submissionId2, $similarity)
{
	$sql =
		'INSERT INTO questtypes_submit_similarities '.
		'(submission_id1, submission_id2, similarity) '.
		'VALUES '.
		'(?, ?, ?) '.
		'ON DUPLICATE KEY UPDATE '.
		'similarity = ?';

	// Same statement for (1, 2) and (2, 1)
	$pairs = array(
		array($submissionId1, $submissionId2),
		array($submissionId2, $submissionId1)
	);
	foreach($pairs as $pair) {
		$this->db->query(
			$sql,
			'iidd',
			$pair[0], $pair[1], $similarity,
			$similarity
		);
	}
}
|
||||
}
|
||||
|
||||
?>
|
||||
|
|
|
@ -16,6 +16,24 @@
|
|||
<?php endforeach ?>
|
||||
</ol>
|
||||
<?php endif ?>
|
||||
<?php if(!empty($submission['similar'])) : ?>
<?php /* List of highly similar submissions of other Characters: one list, one item per submission */ ?>
<h4><?=_('Similar submissions')?></h4>
<ul>
	<?php foreach($submission['similar'] as $similar) : ?>
	<li>
		<p><small><?=_('Similarity')?>: <?=$numberFormatter->format($similar['similarity'])?></small></p>
		<p><a href="<?=$linker->link(array('uploads','seminary',$seminary['url'], $similar['upload']['url']))?>"><?=$similar['upload']['name']?></a></p>
		<p><small>
			<a href="<?=$linker->link(array('quests','submission',$seminary['url'],$similar['questgroup']['url'],$similar['quest']['url'],$similar['character']['url']))?>">
				<?=$similar['character']['name']?>,
				<?=$similar['quest']['title']?>
			</a>,
			<?=$dateFormatter->format(new \DateTime($similar['created']))?> <?=$timeFormatter->format(new \DateTime($similar['created']))?>
		</small></p>
	</li>
	<?php endforeach ?>
</ul>
<?php endif ?>
|
||||
</li>
|
||||
<?php endforeach ?>
|
||||
</ol>
|
||||
|
@ -24,10 +42,17 @@
|
|||
<form method="post" class="logreg">
|
||||
<?php $submission = array_pop($submissions); ?>
|
||||
<?php if(!$solved) : ?>
|
||||
<?=_('Comment')?><br />
|
||||
<textarea name="characterdata[comment]"></textarea><br />
|
||||
<input type="hidden" name="characterdata[submission_id]" value="<?=$submission['id']?>" />
|
||||
<fieldset>
|
||||
<legend><?=_('Comment')?></legend>
|
||||
<textarea id="characterdata-comment" name="characterdata[comment]"></textarea><br />
|
||||
<input type="hidden" name="characterdata[submission_id]" value="<?=$submission['id']?>" />
|
||||
</fieldset>
|
||||
<input type="submit" name="submit" value="<?=_('solved')?>" />
|
||||
<input type="submit" name="submit" value="<?=_('unsolved')?>" />
|
||||
<?php endif ?>
|
||||
</form>
|
||||
<script>
|
||||
$(function() {
|
||||
$("#characterdata-comment").markItUp(mySettings);
|
||||
});
|
||||
</script>
|
||||
|
|
|
@ -12,6 +12,7 @@ img{border:0}
|
|||
h1,h2,h3{color:#103a3e}
|
||||
h2{font-size:120%;margin-top:25px}
|
||||
h3{font-size:100%}
|
||||
h4{margin-bottom:0}
|
||||
ul,ol,nav{padding:0;margin-top:0;list-style-type:none}
|
||||
p{margin:0 0 16px;padding:0}
|
||||
audio,canvas,video{display:inline-block}
|
||||
|
|
Loading…
Add table
Reference in a new issue