From fae80d80bbfe7168535d45775f0e60abb897bf1b Mon Sep 17 00:00:00 2001
From: jwansek
Date: Thu, 2 Dec 2021 17:31:36 +0000
Subject: added query expansion, linked term searches

---
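Notes: a linked term is stored in `linked_terms`.`chain` as a comma-joined
string of vocabulary ids, and per-document scores are merged with
collections.Counter. A minimal sketch of that convention (not part of the
commit), using made-up vocabulary ids and tf-idf values; real ids come from
the `vocabulary` table:

    import collections

    # suppose "computer" -> 12 and "science" -> 34 (hypothetical ids)
    chain = ",".join(["12", "34"])   # "12,34", the linked_terms.chain key
    single_scores = collections.Counter({1: 0.8, 2: 0.2})  # document_id -> tf-idf
    linked_scores = collections.Counter({1: 0.4})          # scores for the chain
    merged = single_scores + linked_scores   # Counter({1: 1.2, 2: 0.2})
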
 database.py | 140 +++++++++++++++++++++++++++++++++++++++++++++---------------
 search.py   |  85 ++++++++++++++++++++++++++----------
 terms.py    |   3 +-
 tf_idf.py   |   1 +
 4 files changed, 172 insertions(+), 57 deletions(-)

diff --git a/database.py b/database.py
index 8fc3584..4534097 100644
--- a/database.py
+++ b/database.py
@@ -2,6 +2,7 @@ from dataclasses import dataclass
 import math as maths
 import sqlite3
 import tf_idf
+import nltk
 import os
 
 class DatabaseCursor(sqlite3.Cursor):
@@ -102,6 +103,16 @@ class Database:
                 out[term] = cursor.fetchone()[0]
         return out
 
+    def get_linked_vocabulary_ids(self, linked_term):
+        ids = []
+        print(linked_term)
+        for term in linked_term:
+            ids.append(str(self.get_vocabulary_ids([term])[term]))
+        print(",".join(ids))
+
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.execute("SELECT `vocabular")
+
     def append_merged_terms(self, merged_terms):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
             cursor.executemany("INSERT OR IGNORE INTO `linked_terms`(`chain`) VALUES (?);", [(i, ) for i in merged_terms])
@@ -128,30 +139,53 @@ class Database:
 
     def build_tf_idf_table(self):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
-            # cursor.execute("DROP VIEW IF EXISTS `tf_idf`;")
-            # cursor.execute("DROP VIEW IF EXISTS `vocab_count`;")
+            cursor.execute("DROP VIEW IF EXISTS `tf_idf_linked`;")
+            cursor.execute("DROP VIEW IF EXISTS `tf_idf_singles`;")
+            cursor.execute("DROP VIEW IF EXISTS `vocab_count_singles`;")
+            cursor.execute("DROP VIEW IF EXISTS `vocab_count_linked`;")
             # i wish i could find a way to do this with a single view but alas i am
             # not good enough at SQL
             cursor.execute("""
-            CREATE VIEW IF NOT EXISTS `vocab_count` AS
+            CREATE VIEW IF NOT EXISTS `vocab_count_singles` AS
             SELECT vocabulary_id,
             COUNT(vocabulary_id) AS vocabulary_count
             FROM term_weights
             GROUP BY vocabulary_id;
             """)
             cursor.execute("""
-            CREATE VIEW IF NOT EXISTS `tf_idf` AS SELECT
+            CREATE VIEW IF NOT EXISTS `tf_idf_singles` AS SELECT
             `term_weights`.`vocabulary_id` AS `vocabulary_id`,
             `document_id`,
             `term_weights`.`weight`,
             LOG_TF(`weight`) AS tf,
             (SELECT COUNT(`document_id`) FROM `documents`) AS n,
-            `vocab_count`.`vocabulary_count` AS df,
-            (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS idf,
-            LOG_TF(`weight`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS tf_idf
+            `vocab_count_singles`.`vocabulary_count` AS df,
+            (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count_singles`.`vocabulary_count`) FROM documents) AS idf,
+            LOG_TF(`weight`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count_singles`.`vocabulary_count`) FROM documents) AS tf_idf
+            FROM `term_weights`
+            INNER JOIN `vocab_count_singles`
+            ON `vocab_count_singles`.`vocabulary_id` = `term_weights`.`vocabulary_id`
+            ;""")
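+            # the two views below mirror the single-term views above, keyed on
+            # `linked_term_id` (a comma-joined chain of vocabulary ids) instead
+            # of `vocabulary_id`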
+            cursor.execute("""
+            CREATE VIEW IF NOT EXISTS `vocab_count_linked` AS
+            SELECT linked_term_id,
+            COUNT(linked_term_id) AS linked_term_count
+            FROM term_weights
+            GROUP BY linked_term_id;
+            """)
+            cursor.execute("""
+            CREATE VIEW IF NOT EXISTS `tf_idf_linked` AS SELECT
+            `term_weights`.`linked_term_id` AS `linked_term_id`,
+            `document_id`,
+            `term_weights`.`weight`,
+            LOG_TF(`weight`) AS tf,
+            (SELECT COUNT(`document_id`) FROM `documents`) AS n,
+            `vocab_count_linked`.`linked_term_count` AS df,
+            (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count_linked`.`linked_term_count`) FROM documents) AS idf,
+            LOG_TF(`weight`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count_linked`.`linked_term_count`) FROM documents) AS tf_idf
             FROM `term_weights`
-            INNER JOIN `vocab_count`
-            ON `vocab_count`.`vocabulary_id` = `term_weights`.`vocabulary_id`
+            INNER JOIN `vocab_count_linked`
+            ON `vocab_count_linked`.`linked_term_id` = `term_weights`.`linked_term_id`
             ;""")
 
     def get_term_frequencies(self):
@@ -159,26 +193,6 @@ class Database:
             cursor.execute("SELECT * FROM `term_weights`;")
             return cursor.fetchall()
 
-    def append_tf_idf_table(self, tfs):
-        # print([(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs][1])
-        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
-            cursor.executemany("""
-            INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
-            VALUES (
-                ?, ?, ?,
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?),
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?) * ?)
-            """, [(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs])
-
-    def append_tf_idf_table_single(self, vocabulary_id, document_id, tf):
-        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
-            cursor.execute("""
-            INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
-            VALUES (
-                ?, ?, ?,
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?),
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?) * ?)
-            """, (vocabulary_id, document_id, tf, vocabulary_id, vocabulary_id, tf))
 
     def test_log(self, to_log):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
@@ -192,21 +206,79 @@ class Database:
 
     def get_tf_idf_table(self):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
-            cursor.execute("SELECT * FROM `tf_idf` LIMIT 100;")
+            cursor.execute("SELECT * FROM `tf_idf_singles` LIMIT 100;")
             out = cursor.fetchall()
             print(len(out))
             print(("vocabulary_id", "document_id", "weight", "tf", "n", "df", "idf"))
             for l in out[:100]:
                 print(l)
 
-    def get_tf_idf_score(self, term, tf_idf_thresh = 0, limit = 1000):
+    def get_tf_idf_table2(self):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.execute("SELECT * FROM `tf_idf_linked` LIMIT 100;")
+            out = cursor.fetchall()
+            for l in out[:100]:
+                print(l)
+
+    def get_tf_idf_score_single(self, term, tf_idf_thresh = 0, limit = 1000, multiplier = 1):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
             cursor.execute("""
-            SELECT `document_id`, `tf_idf` FROM `tf_idf` WHERE `vocabulary_id` = (
+            SELECT `document_id`, `tf_idf` FROM `tf_idf_singles` WHERE `vocabulary_id` = (
                 SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?
             ) AND `tf_idf` > ? ORDER BY `tf_idf` DESC LIMIT ?;
             """, (term, tf_idf_thresh, limit))
-            return {i[0]: i[1] for i in cursor.fetchall()}
+            return {i[0]: maths.log10(i[1] * multiplier) for i in cursor.fetchall()}
+
+    def get_tf_idf_score_linked(self, linked_term, tf_idf_thresh = 0, limit = 1000, multiplier = 1):
+        word_ids = []
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            for word in linked_term:
+                cursor.execute("SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?", (word, ))
+                try:
+                    word_ids.append(str(cursor.fetchone()[0]))
+                except TypeError:
+                    continue
+
+            chain = ",".join(word_ids)
+
+            cursor.execute("""
+            SELECT `document_id`, `tf_idf` FROM `tf_idf_linked` WHERE `linked_term_id` = (
+                SELECT `linked_term_id` FROM `linked_terms` WHERE `chain` = ?
+            ) AND `tf_idf` > ? ORDER BY `tf_idf` DESC LIMIT ?;
+            """, (chain, tf_idf_thresh, limit))
+            return {i[0]: maths.log10(i[1] * multiplier) for i in cursor.fetchall()}
+
+    def attempt_get_linked_words(self, words):
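+        # map each word to its vocabulary id, then check which comma-joined
+        # contiguous n-grams (n >= 2) of those ids exist as chains in `linked_terms`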
Using terms: %s" % " ".join(search_words)) + starttime = time.time() + pos_tags = [(token, tag) for token, tag in pos_tag(search_words) if token.lower().replace(",", "") not in terms.STOPWORDS] + + single_terms = [w.lower() for w in search_words] + logging.info("Started with the terms: %s" % str(single_terms)) + with database.Database() as db: + l = db.attempt_get_linked_words(single_terms) + linked_terms = collections.Counter([",".join(i) for i in l]) + # do again so we get a weight of 2 + linked_terms += collections.Counter([",".join(i) for i in l]) + logging.info("Found the linked terms: %s" % str(l)) + + synsets = [wordnet.synsets(token, WORDNET_POS_MAP[tag]) for token, tag in pos_tags if WORDNET_POS_MAP.__contains__(tag)] + synonyms = list(itertools.chain.from_iterable([[lemma.name().lower().replace("_", ",") for syn in synset for lemma in syn.lemmas()] for synset in synsets])) + + # for syn in synsets: + # for sy in syn: + # print([w for s in sy.closure(lambda s:s.hyponyms()) for w in s.lemma_names()]) + + for synonym in synonyms: + if len(synonym.split(",")) > 1: + linked_terms[synonym] = 1 + else: + single_terms.append(synonym) + + single_terms = collections.Counter(single_terms) + + logging.info("Expanded single terms to: %s" % str(single_terms)) + logging.info("Expanded linked terms to: %s" % str(linked_terms)) + logging.info("\n\n") with database.Database() as db: - tf_idf_scores = [] - for term in search_words: - tf_idf_scores.append(db.get_tf_idf_score(term, tf_idf_thresh = 1, limit = 1000)) - logging.info("Fetched %d scores for term '%s'..." % (len(tf_idf_scores[-1]), term)) - - merged_scores = {i: 0 for i in range(1, db.get_num_documents() + 1)} - for scorelist in tf_idf_scores: - for doc_id, score in scorelist.items(): - merged_scores[doc_id] += score - logging.info("Merged scores...") - - sorted_scores = list(reversed(sorted(merged_scores.items(), key = lambda i: i[1]))) + tf_idf_scores = collections.Counter() + for single_term, search_weight in single_terms.items(): + scores = collections.Counter(db.get_tf_idf_score_single(single_term, tf_idf_thresh = 1, limit = 1000, multiplier = search_weight)) + logging.info("Got %d scores for term '%s' (multiplier %d)" % (len(scores), single_term, search_weight)) + tf_idf_scores += scores + + for linked_terms, search_weight in linked_terms.items(): + scores = db.get_tf_idf_score_linked(linked_terms.split(","), tf_idf_thresh=0, multiplier=search_weight) + logging.info("Got %d scores for linked term '%s' (multiplier %d)" % (len(scores), str(linked_terms), search_weight)) + tf_idf_scores += scores + + sorted_scores = list(reversed(sorted(tf_idf_scores.items(), key = lambda i: i[1]))) toshow = 30 logging.info("Sorted scores...") + logging.info("Results:\n\n") for i, j in enumerate(sorted_scores, 0): if i >= toshow: @@ -46,7 +87,7 @@ def main(search_words): docid, score = j logging.info("%.2f - %d - %s" % (score, docid, db.get_document_name_by_id(docid))) - logging.info("%d results found in total" % len([i[1] for i in sorted_scores if i[1] > 0.1])) + logging.info("Got %d results in total. 
+    for synonym in synonyms:
+        if len(synonym.split(",")) > 1:
+            linked_terms[synonym] = 1
+        else:
+            single_terms.append(synonym)
+
+    single_terms = collections.Counter(single_terms)
+
+    logging.info("Expanded single terms to: %s" % str(single_terms))
+    logging.info("Expanded linked terms to: %s" % str(linked_terms))
+    logging.info("\n\n")
 
     with database.Database() as db:
-        tf_idf_scores = []
-        for term in search_words:
-            tf_idf_scores.append(db.get_tf_idf_score(term, tf_idf_thresh = 1, limit = 1000))
-            logging.info("Fetched %d scores for term '%s'..." % (len(tf_idf_scores[-1]), term))
-
-        merged_scores = {i: 0 for i in range(1, db.get_num_documents() + 1)}
-        for scorelist in tf_idf_scores:
-            for doc_id, score in scorelist.items():
-                merged_scores[doc_id] += score
-        logging.info("Merged scores...")
-
-        sorted_scores = list(reversed(sorted(merged_scores.items(), key = lambda i: i[1])))
+        tf_idf_scores = collections.Counter()
+        for single_term, search_weight in single_terms.items():
+            scores = collections.Counter(db.get_tf_idf_score_single(single_term, tf_idf_thresh = 1, limit = 1000, multiplier = search_weight))
+            logging.info("Got %d scores for term '%s' (multiplier %d)" % (len(scores), single_term, search_weight))
+            tf_idf_scores += scores
+
+        for linked_term, search_weight in linked_terms.items():
+            scores = collections.Counter(db.get_tf_idf_score_linked(linked_term.split(","), tf_idf_thresh = 0, multiplier = search_weight))
+            logging.info("Got %d scores for linked term '%s' (multiplier %d)" % (len(scores), linked_term, search_weight))
+            tf_idf_scores += scores
+
+        sorted_scores = sorted(tf_idf_scores.items(), key = lambda i: i[1], reverse = True)
         toshow = 30
         logging.info("Sorted scores...")
+        logging.info("Results:\n\n")
 
         for i, j in enumerate(sorted_scores, 0):
             if i >= toshow:
@@ -46,7 +87,7 @@ def main(search_words):
                 break
 
             docid, score = j
             logging.info("%.2f - %d - %s" % (score, docid, db.get_document_name_by_id(docid)))
 
-        logging.info("%d results found in total" % len([i[1] for i in sorted_scores if i[1] > 0.1]))
+        logging.info("Got %d results in total. Took %.2f minutes (%.2fs per term)" % (len(tf_idf_scores), (time.time() - starttime) / 60, (time.time() - starttime) / (len(single_terms) + len(linked_terms))))
 
 
 if __name__ == "__main__":
diff --git a/terms.py b/terms.py
index 07055dd..b9d77e2 100644
--- a/terms.py
+++ b/terms.py
@@ -41,7 +41,7 @@ def main():
             #break
 
 def parse_region(raw_text, region_weight, document_id):
-    print("d: %d; w: %d; len = %d" % (document_id, region_weight, len(raw_text)))
+    print("d: %d; w: %d; len: %d" % (document_id, region_weight, len(raw_text)), end = "")
     terms = word_tokenize(raw_text)
     terms = [re.sub(r"[^a-zA-Z0-9\s]", "", term).rstrip().lower() for term in terms]
     terms = [LEM.lemmatize(i) for i in terms if i != "" and i not in STOPWORDS]
@@ -71,6 +71,7 @@ def append_region(terms, linked_words, region_weight, document_id):
 
     weighted_terms = {i[0]:i[1] * region_weight for i in collections.Counter(terms).items()}
     weighted_linked_terms = {i[0]:i[1] * region_weight for i in collections.Counter(linked_words_ids).items()}
+    print("; t: %d; lt: %d" % (len(weighted_terms), len(weighted_linked_terms)))
 
     return weighted_terms, weighted_linked_terms
 
 def parse_document(document_id, document_path, numdocs):
diff --git a/tf_idf.py b/tf_idf.py
index 883f236..a4692f7 100644
--- a/tf_idf.py
+++ b/tf_idf.py
@@ -6,6 +6,7 @@ def main():
 
     db.build_tf_idf_table()
     db.get_tf_idf_table()
+    db.get_tf_idf_table2()
 
 def calc_log_tf(tf):
     if tf == 0:
-- 
cgit v1.2.3