From 95563cbf5c0d66016892e8d580033088f865010b Mon Sep 17 00:00:00 2001
From: jwansek
Date: Thu, 25 Nov 2021 15:55:59 +0000
Subject: added linked terms, weighting depending on location

---
 database.py |  81 ++++++++++++++++++++++++++++++------------
 search.py   |   7 ++--
 terms.py    | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 161 insertions(+), 41 deletions(-)

diff --git a/database.py b/database.py
index 875d22e..5c326b4 100644
--- a/database.py
+++ b/database.py
@@ -45,14 +45,23 @@ class Database:
             `vocabulary_id` INTEGER PRIMARY KEY,
             `term` TEXT NOT NULL
         );""")
+        # \/ VERY UGLY, NOT HOW THIS SHOULD BE DONE
         cursor.execute("""
-        CREATE TABLE IF NOT EXISTS `term_frequency` (
-            `vocabulary_id` INT UNSIGNED NOT NULL,
+        CREATE TABLE IF NOT EXISTS `linked_terms` (
+            `linked_term_id` INTEGER PRIMARY KEY,
+            `chain` TEXT NOT NULL
+        );""")
+        cursor.execute("CREATE UNIQUE INDEX unique_chain on linked_terms (chain);")
+        cursor.execute("""
+        CREATE TABLE IF NOT EXISTS `term_weights` (
+            `term_weight_id` INTEGER PRIMARY KEY,
+            `vocabulary_id` INT UNSIGNED NULL,
+            `linked_term_id` INT UNSIGNED NULL,
             `document_id` INT UNSIGNED NOT NULL,
-            `frequency` INT UNSIGNED NOT NULL,
-            FOREIGN KEY (`vocabulary_id`) REFERENCES `vocabulary`(`vocabulary_id`),
-            FOREIGN KEY (`document_id`) REFERENCES `documents`(`document_id`),
-            PRIMARY KEY(`vocabulary_id`, `document_id`)
+            `weight` INT UNSIGNED NOT NULL,
+            FOREIGN KEY (`vocabulary_id`) REFERENCES `vocabulary`(`vocabulary_id`),
+            FOREIGN KEY (`linked_term_id`) REFERENCES `linked_terms`(`linked_term_id`),
+            FOREIGN KEY (`document_id`) REFERENCES `documents`(`document_id`)
         );""")

         cursor.execute("CREATE UNIQUE INDEX unique_terms on vocabulary (term);")
@@ -79,12 +88,38 @@ class Database:
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
             cursor.executemany("INSERT OR IGNORE INTO vocabulary(term) VALUES (?);", [(term, ) for term in terms])

-    def append_terms_in_document(self, document_id, counter):
+    def get_vocabulary_ids(self, terms):
+        # can't do executemany with select statements :(
+        out = {}
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            for term in terms:
+                cursor.execute("SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?;", (term, ))
+                out[term] = cursor.fetchone()[0]
+        return out
+
+    def append_merged_terms(self, merged_terms):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.executemany("INSERT OR IGNORE INTO `linked_terms`(`chain`) VALUES (?);", [(i, ) for i in merged_terms])
+
+    def append_document_term_weights(self, terms, document_id):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.executemany("""
+            INSERT INTO `term_weights`
+            (`vocabulary_id`, `linked_term_id`, `document_id`, `weight`)
+            VALUES ((
+                SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?
+            ), NULL, ?, ?);
+            """, [(i[0], document_id, i[1]) for i in terms.items()])
+
+    def append_document_linked_term_weights(self, linked_terms, document_id):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
             cursor.executemany("""
-            INSERT INTO `term_frequency`(`vocabulary_id`, `document_id`, `frequency`)
-            VALUES ((SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?), ?, ?)
-            """, [(i[0], document_id, i[1]) for i in counter.items()])
+            INSERT INTO `term_weights`
+            (`vocabulary_id`, `linked_term_id`, `document_id`, `weight`)
+            VALUES (NULL, (
+                SELECT `linked_term_id` FROM `linked_terms` WHERE `chain` = ?
+            ), ?, ?);
+            """, [(i[0], document_id, i[1]) for i in linked_terms.items()])

     def build_tf_idf_table(self):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
@@ -96,27 +131,27 @@ class Database:
             CREATE VIEW IF NOT EXISTS `vocab_count` AS
             SELECT vocabulary_id,
             COUNT(vocabulary_id) AS vocabulary_count
-            FROM term_frequency
+            FROM term_weights
             GROUP BY vocabulary_id;
             """)
             cursor.execute("""
             CREATE VIEW IF NOT EXISTS `tf_idf` AS SELECT
-                `term_frequency`.`vocabulary_id` AS `vocabulary_id`,
+                `term_weights`.`vocabulary_id` AS `vocabulary_id`,
                 `document_id`,
-                `term_frequency`.`frequency`,
-                LOG_TF(`frequency`) AS tf,
+                `term_weights`.`weight`,
+                LOG_TF(`weight`) AS tf,
                 (SELECT COUNT(`document_id`) FROM `documents`) AS n,
                 `vocab_count`.`vocabulary_count` AS df,
                 (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS idf,
-                LOG_TF(`frequency`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS tf_idf
-            FROM `term_frequency`
+                LOG_TF(`weight`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS tf_idf
+            FROM `term_weights`
             INNER JOIN `vocab_count`
-            ON `vocab_count`.`vocabulary_id` = `term_frequency`.`vocabulary_id`
+            ON `vocab_count`.`vocabulary_id` = `term_weights`.`vocabulary_id`
             ;""")

     def get_term_frequencies(self):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
-            cursor.execute("SELECT * FROM `term_frequency`;")
+            cursor.execute("SELECT * FROM `term_weights`;")
             return cursor.fetchall()

     def append_tf_idf_table(self, tfs):
@@ -126,8 +161,8 @@ class Database:
             INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
             VALUES (
                 ?, ?, ?,
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?),
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?)
+                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?),
+                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?) * ?)
             """, [(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs])

     def append_tf_idf_table_single(self, vocabulary_id, document_id, tf):
@@ -136,8 +171,8 @@ class Database:
             INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
             VALUES (
                 ?, ?, ?,
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?),
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?)
+                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?),
+                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?) * ?)
""", (vocabulary_id, document_id, tf, vocabulary_id, vocabulary_id, tf)) def test_log(self, to_log): @@ -155,7 +190,7 @@ class Database: cursor.execute("SELECT * FROM `tf_idf` LIMIT 100;") out = cursor.fetchall() print(len(out)) - print(("vocabulary_id", "document_id", "frequency", "tf", "n", "df", "idf")) + print(("vocabulary_id", "document_id", "weight", "tf", "n", "df", "idf")) for l in out[:100]: print(l) diff --git a/search.py b/search.py index 60876d9..e6c3330 100644 --- a/search.py +++ b/search.py @@ -36,8 +36,8 @@ def main(search_words): logging.info("Merged scores...") sorted_scores = list(reversed(sorted(merged_scores.items(), key = lambda i: i[1]))) - toshow = 20 - print("Sorted scores...") + toshow = 30 + logging.info("Sorted scores...") for i, j in enumerate(sorted_scores, 0): if i >= toshow: @@ -45,6 +45,9 @@ def main(search_words): docid, score = j logging.info("%.2f - %d - %s" % (score, docid, db.get_document_name_by_id(docid))) + + logging.info("%d results found in total" % len([i[1] for i in sorted_scores if i[1] > 0.1])) + if __name__ == "__main__": main(sys.argv[1:]) diff --git a/terms.py b/terms.py index ab3fcfc..5da0acc 100644 --- a/terms.py +++ b/terms.py @@ -2,13 +2,24 @@ from nltk.corpus import stopwords from nltk.tokenize import word_tokenize from nltk.stem import PorterStemmer from nltk.stem import WordNetLemmatizer -from nltk.util import ngrams -import collections +from nltk.tokenize import word_tokenize +from nltk.util import ngrams +from nltk import pos_tag +import collections +import itertools import documents import database +import random +import nltk +import time import bs4 +import os import re +import spacy +from spacy import displacy +nlp = spacy.load("en_core_web_sm") + STOPWORDS = set(stopwords.words('english')).difference({ "how", "because", "through", "or", "as", "about", "not", "no", "who", "of", "can", "over", "you" @@ -17,31 +28,102 @@ LEM = WordNetLemmatizer() def main(): numdocs = documents.get_num_documents() - for document_id in range(1, numdocs): - parse_document(document_id, documents.get_document_name_by_id(document_id), numdocs) + docid = random.randint(1, numdocs) + parse_document(docid, documents.get_document_name_by_id(docid), numdocs) + + # for document_id in range(1, numdocs): + # parse_document(document_id, documents.get_document_name_by_id(document_id), numdocs) + + # # break + +def parse_region(raw_text, region_weight, document_id): + terms = word_tokenize(raw_text) + terms = [re.sub(r"[^a-zA-Z0-9\s]", "", term).rstrip().lower() for term in terms] + terms = [LEM.lemmatize(i) for i in terms if i != "" and i not in STOPWORDS] + + processed = nlp(raw_text) + linked_words = [] + for ent in processed.ents: + words = [ + re.sub(r"[^a-zA-Z0-9\s]", "", word).rstrip().lower() + for word in word_tokenize(ent.text) + if re.sub(r"[^a-zA-Z0-9\s]", "", word).rstrip().lower() != "" + ] + if len(words) > 1: + linked_words.append(words) + + return append_region(terms, linked_words, region_weight, document_id) + +def append_region(terms, linked_words, region_weight, document_id): + flattened_linked_words = set(itertools.chain.from_iterable(linked_words)) + with database.Database() as db: + db.append_terms(flattened_linked_words.union(set(terms))) + ids = db.get_vocabulary_ids(flattened_linked_words) + + linked_words_ids = [str([ids[j] for j in i])[1:-1].replace(" ", "") for i in linked_words] + db.append_merged_terms(linked_words_ids) - # break + weighted_terms = {i[0]:i[1] * region_weight for i in collections.Counter(terms).items()} + 
+    weighted_linked_terms = {i[0]:i[1] * region_weight for i in collections.Counter(linked_words_ids).items()}
+
+    return weighted_terms, weighted_linked_terms

 def parse_document(document_id, document_path, numdocs):
+    starttime = time.time()
     with open(document_path, "r") as f:
         soup = bs4.BeautifulSoup(f.read(), "lxml")

+    weighted_terms = collections.Counter()
+    weighted_linked_terms = collections.Counter()
+
+    # parse the file name, it has a weight of 100
+    filenametext = os.path.splitext(os.path.split(document_path)[-1])[0]
+    region_weighted_terms, region_linked_terms = parse_region(filenametext, 100, document_id)
+    weighted_terms += region_weighted_terms
+    weighted_linked_terms += region_linked_terms
+
+    # parse the main text, it has a weight of 1
     text = [e.text for e in soup.find("div", {"class": "mw-parser-output"}).findChildren(recursive = True)]
-    text = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in text]
-
-    terms = []
-    for i in text:
-        terms += re.split(r"\s+|\n", i)
+    region_weighted_terms, region_linked_terms = parse_region(" ".join(text), 1, document_id)
+    weighted_terms += region_weighted_terms
+    weighted_linked_terms += region_linked_terms
+
+    # parse the html headers (h1 and h2), they have a weight of 50
+    elemtexts = []
+    try:
+        elemtexts += [e.text for e in soup.h1.findChildren(recursive = True)]
+    except AttributeError:
+        pass

-    terms = [LEM.lemmatize(i) for i in terms if i != "" and i not in STOPWORDS]
-    terms_counted = collections.Counter(terms)
+    try:
+        elemtexts += [e.text for e in soup.h2.findChildren(recursive = True)]
+    except AttributeError:
+        pass
+
+    region_weighted_terms, region_linked_terms = parse_region(re.sub(r"edit|Contents|source", "", " ".join(elemtexts)), 50, document_id)
+    weighted_terms += region_weighted_terms
+    weighted_linked_terms += region_linked_terms
+
+    # parse the html link element texts, they have a weight of 10
+    a_texts = [e.text for e in soup.select("a") if e.text != "" and e.text != "edit" and e.text != "edit source"]
+    region_weighted_terms, region_linked_terms = parse_region(" ".join(a_texts), 10, document_id)
+    weighted_terms += region_weighted_terms
+    weighted_linked_terms += region_linked_terms

     with database.Database() as db:
-        db.append_terms(terms)
-        print("[%f%%] Added %d terms from docid: %d" % ((document_id/numdocs)*100, len(terms_counted), document_id))
+        db.append_document_term_weights(weighted_terms, document_id)
+        db.append_document_linked_term_weights(weighted_linked_terms, document_id)
+
+        print("[%.3f%%] {%.1fs} %d terms (weight %d), %d linked terms (weight %d) - %s" % (
+            (document_id/numdocs)*100,
+            time.time() - starttime,
+            len(weighted_terms),
+            sum(weighted_terms.values()),
+            len(weighted_linked_terms),
+            sum(weighted_linked_terms.values()),
+            document_path
+        ))

-        db.append_terms_in_document(document_id, terms_counted)
-        print("Appended term frequency too")

 if __name__ == "__main__":
     main()
\ No newline at end of file
-- 
cgit v1.2.3
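A note on the weighting scheme this commit introduces: parse_document now scores a term by where it occurs as well as how often. File name hits count 100 times, h1/h2 header hits 50 times, link text 10 times, and body text once, and the per-region scores are accumulated with collections.Counter addition. The sketch below illustrates that accumulation in isolation, without the NLTK/spaCy normalisation or the database layer. The REGION_WEIGHTS name, the tokenizer, and the sample inputs are illustrative assumptions, not part of this commit; only the weight values (100/50/10/1) come from parse_document.

    import collections
    import re

    # Location weights used by parse_document in this commit:
    # file name 100, h1/h2 headers 50, link text 10, body text 1.
    REGION_WEIGHTS = {"filename": 100, "headers": 50, "links": 10, "body": 1}

    def tokenize(raw_text):
        # Rough stand-in for parse_region's normalisation: drop
        # non-alphanumerics, lowercase, split on whitespace.
        # (No stopword removal or lemmatisation here.)
        return [t for t in re.sub(r"[^a-zA-Z0-9\s]", "", raw_text).lower().split() if t]

    def weight_document(regions):
        # regions maps region name -> raw text. Each region's term counts
        # are scaled by that region's weight, then all regions are summed,
        # mirroring how parse_document accumulates Counter objects.
        total = collections.Counter()
        for region, raw_text in regions.items():
            counts = collections.Counter(tokenize(raw_text))
            total += collections.Counter(
                {term: n * REGION_WEIGHTS[region] for term, n in counts.items()}
            )
        return total

    print(weight_document({
        "filename": "Iron Ingot",
        "headers": "Obtaining Usage",
        "links": "Iron Ore Furnace",
        "body": "iron ingots are made by smelting iron ore in a furnace",
    }).most_common(3))
    # "iron" scores 100 (filename) + 10 (links) + 2 (body) = 112

On the linked-terms side, each multi-word entity that spaCy finds is stored in linked_terms.chain as a comma-joined string of vocabulary ids (for example "12,34", built by str([...])[1:-1].replace(" ", "")), which the in-code comment already flags as ugly. A normalised junction table keyed on (linked_term_id, position, vocabulary_id) would be the conventional alternative.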