From fae80d80bbfe7168535d45775f0e60abb897bf1b Mon Sep 17 00:00:00 2001
From: jwansek
Date: Thu, 2 Dec 2021 17:31:36 +0000
Subject: added query expansion, linked term searches

---
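Notes: a linked term is stored in `linked_terms`.`chain` as a comma-joined
string of vocabulary ids, and per-document scores are merged with
collections.Counter. A minimal sketch of that convention (not part of the
commit), using made-up vocabulary ids and tf-idf values; real ids come from
the `vocabulary` table:

    import collections

    # suppose "computer" -> 12 and "science" -> 34 (hypothetical ids)
    chain = ",".join(["12", "34"])   # "12,34", the linked_terms.chain key
    single_scores = collections.Counter({1: 0.8, 2: 0.2})  # document_id -> tf-idf
    linked_scores = collections.Counter({1: 0.4})          # scores for the chain
    merged = single_scores + linked_scores   # Counter({1: 1.2, 2: 0.2})
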
 database.py | 140 +++++++++++++++++++++++++++++++++++++++++++++---------------
 search.py   |  85 ++++++++++++++++++++++++++----------
 terms.py    |   3 +-
 tf_idf.py   |   1 +
 4 files changed, 172 insertions(+), 57 deletions(-)

diff --git a/database.py b/database.py
index 8fc3584..4534097 100644
--- a/database.py
+++ b/database.py
@@ -2,6 +2,7 @@ from dataclasses import dataclass
 import math as maths
 import sqlite3
 import tf_idf
+import nltk
 import os
 
 class DatabaseCursor(sqlite3.Cursor):
@@ -102,6 +103,16 @@ class Database:
                 out[term] = cursor.fetchone()[0]
         return out
 
+    def get_linked_vocabulary_ids(self, linked_term):
+        ids = []
+        print(linked_term)
+        for term in linked_term:
+            ids.append(str(self.get_vocabulary_ids([term])[term]))
+        print(",".join(ids))
+
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.execute("SELECT `vocabular")
+
     def append_merged_terms(self, merged_terms):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
             cursor.executemany("INSERT OR IGNORE INTO `linked_terms`(`chain`) VALUES (?);", [(i, ) for i in merged_terms])
@@ -128,30 +139,53 @@ class Database:
 
     def build_tf_idf_table(self):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
-            # cursor.execute("DROP VIEW IF EXISTS `tf_idf`;")
-            # cursor.execute("DROP VIEW IF EXISTS `vocab_count`;")
+            cursor.execute("DROP VIEW IF EXISTS `tf_idf_linked`;")
+            cursor.execute("DROP VIEW IF EXISTS `tf_idf_singles`;")
+            cursor.execute("DROP VIEW IF EXISTS `vocab_count_singles`;")
+            cursor.execute("DROP VIEW IF EXISTS `vocab_count_linked`;")
             # i wish i could find a way to do this with a single view but alas i am
             # not good enough at SQL
             cursor.execute("""
-            CREATE VIEW IF NOT EXISTS `vocab_count` AS
+            CREATE VIEW IF NOT EXISTS `vocab_count_singles` AS
             SELECT vocabulary_id,
             COUNT(vocabulary_id) AS vocabulary_count
             FROM term_weights
             GROUP BY vocabulary_id;
             """)
             cursor.execute("""
-            CREATE VIEW IF NOT EXISTS `tf_idf` AS SELECT
+            CREATE VIEW IF NOT EXISTS `tf_idf_singles` AS SELECT
             `term_weights`.`vocabulary_id` AS `vocabulary_id`,
             `document_id`,
             `term_weights`.`weight`,
             LOG_TF(`weight`) AS tf,
             (SELECT COUNT(`document_id`) FROM `documents`) AS n,
-            `vocab_count`.`vocabulary_count` AS df,
-            (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS idf,
-            LOG_TF(`weight`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS tf_idf
+            `vocab_count_singles`.`vocabulary_count` AS df,
+            (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count_singles`.`vocabulary_count`) FROM documents) AS idf,
+            LOG_TF(`weight`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count_singles`.`vocabulary_count`) FROM documents) AS tf_idf
+            FROM `term_weights`
+            INNER JOIN `vocab_count_singles`
+            ON `vocab_count_singles`.`vocabulary_id` = `term_weights`.`vocabulary_id`
+            ;""")
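+            # the two views below mirror the single-term views above, keyed on
+            # `linked_term_id` (a comma-joined chain of vocabulary ids) instead
+            # of `vocabulary_id`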
+            cursor.execute("""
+            CREATE VIEW IF NOT EXISTS `vocab_count_linked` AS
+            SELECT linked_term_id,
+            COUNT(linked_term_id) AS linked_term_count
+            FROM term_weights
+            GROUP BY linked_term_id;
+            """)
+            cursor.execute("""
+            CREATE VIEW IF NOT EXISTS `tf_idf_linked` AS SELECT
+            `term_weights`.`linked_term_id` AS `linked_term_id`,
+            `document_id`,
+            `term_weights`.`weight`,
+            LOG_TF(`weight`) AS tf,
+            (SELECT COUNT(`document_id`) FROM `documents`) AS n,
+            `vocab_count_linked`.`linked_term_count` AS df,
+            (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count_linked`.`linked_term_count`) FROM documents) AS idf,
+            LOG_TF(`weight`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count_linked`.`linked_term_count`) FROM documents) AS tf_idf
             FROM `term_weights`
-            INNER JOIN `vocab_count`
-            ON `vocab_count`.`vocabulary_id` = `term_weights`.`vocabulary_id`
+            INNER JOIN `vocab_count_linked`
+            ON `vocab_count_linked`.`linked_term_id` = `term_weights`.`linked_term_id`
             ;""")
 
     def get_term_frequencies(self):
@@ -159,26 +193,6 @@ class Database:
             cursor.execute("SELECT * FROM `term_weights`;")
             return cursor.fetchall()
 
-    def append_tf_idf_table(self, tfs):
-        # print([(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs][1])
-        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
-            cursor.executemany("""
-            INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
-            VALUES (
-                ?, ?, ?,
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?),
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?) * ?)
-            """, [(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs])
-
-    def append_tf_idf_table_single(self, vocabulary_id, document_id, tf):
-        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
-            cursor.execute("""
-            INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
-            VALUES (
-                ?, ?, ?,
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?),
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?) * ?)
-            """, (vocabulary_id, document_id, tf, vocabulary_id, vocabulary_id, tf))
 
     def test_log(self, to_log):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
@@ -192,21 +206,79 @@ class Database:
 
     def get_tf_idf_table(self):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
-            cursor.execute("SELECT * FROM `tf_idf` LIMIT 100;")
+            cursor.execute("SELECT * FROM `tf_idf_singles` LIMIT 100;")
             out = cursor.fetchall()
             print(len(out))
             print(("vocabulary_id", "document_id", "weight", "tf", "n", "df", "idf"))
             for l in out[:100]:
                 print(l)
 
-    def get_tf_idf_score(self, term, tf_idf_thresh = 0, limit = 1000):
+    def get_tf_idf_table2(self):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.execute("SELECT * FROM `tf_idf_linked` LIMIT 100;")
+            out = cursor.fetchall()
+            for l in out[:100]:
+                print(l)
+
+    def get_tf_idf_score_single(self, term, tf_idf_thresh = 0, limit = 1000, multiplier = 1):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
             cursor.execute("""
-            SELECT `document_id`, `tf_idf` FROM `tf_idf` WHERE `vocabulary_id` = (
+            SELECT `document_id`, `tf_idf` FROM `tf_idf_singles` WHERE `vocabulary_id` = (
                 SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?
             ) AND `tf_idf` > ? ORDER BY `tf_idf` DESC LIMIT ?;
             """, (term, tf_idf_thresh, limit))
-            return {i[0]: i[1] for i in cursor.fetchall()}
+            return {i[0]: maths.log10(i[1] * multiplier) for i in cursor.fetchall()}
+
+    def get_tf_idf_score_linked(self, linked_term, tf_idf_thresh = 0, limit = 1000, multiplier = 1):
+        word_ids = []
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            for word in linked_term:
+                cursor.execute("SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?", (word, ))
+                try:
+                    word_ids.append(str(cursor.fetchone()[0]))
+                except TypeError:
+                    continue
+
+            chain = ",".join(word_ids)
+
+            cursor.execute("""
+            SELECT `document_id`, `tf_idf` FROM `tf_idf_linked` WHERE `linked_term_id` = (
+                SELECT `linked_term_id` FROM `linked_terms` WHERE `chain` = ?
+            ) AND `tf_idf` > ? ORDER BY `tf_idf` DESC LIMIT ?;
+            """, (chain, tf_idf_thresh, limit))
+            return {i[0]: maths.log10(i[1] * multiplier) for i in cursor.fetchall()}
+
+    def attempt_get_linked_words(self, words):
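+        # map each word to its vocabulary id, then check which comma-joined
+        # contiguous n-grams (n >= 2) of those ids exist as chains in `linked_terms`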
Using terms: %s" % " ".join(search_words)) + starttime = time.time() + pos_tags = [(token, tag) for token, tag in pos_tag(search_words) if token.lower().replace(",", "") not in terms.STOPWORDS] + + single_terms = [w.lower() for w in search_words] + logging.info("Started with the terms: %s" % str(single_terms)) + with database.Database() as db: + l = db.attempt_get_linked_words(single_terms) + linked_terms = collections.Counter([",".join(i) for i in l]) + # do again so we get a weight of 2 + linked_terms += collections.Counter([",".join(i) for i in l]) + logging.info("Found the linked terms: %s" % str(l)) + + synsets = [wordnet.synsets(token, WORDNET_POS_MAP[tag]) for token, tag in pos_tags if WORDNET_POS_MAP.__contains__(tag)] + synonyms = list(itertools.chain.from_iterable([[lemma.name().lower().replace("_", ",") for syn in synset for lemma in syn.lemmas()] for synset in synsets])) + + # for syn in synsets: + # for sy in syn: + # print([w for s in sy.closure(lambda s:s.hyponyms()) for w in s.lemma_names()]) + + for synonym in synonyms: + if len(synonym.split(",")) > 1: + linked_terms[synonym] = 1 + else: + single_terms.append(synonym) + + single_terms = collections.Counter(single_terms) + + logging.info("Expanded single terms to: %s" % str(single_terms)) + logging.info("Expanded linked terms to: %s" % str(linked_terms)) + logging.info("\n\n") with database.Database() as db: - tf_idf_scores = [] - for term in search_words: - tf_idf_scores.append(db.get_tf_idf_score(term, tf_idf_thresh = 1, limit = 1000)) - logging.info("Fetched %d scores for term '%s'..." % (len(tf_idf_scores[-1]), term)) - - merged_scores = {i: 0 for i in range(1, db.get_num_documents() + 1)} - for scorelist in tf_idf_scores: - for doc_id, score in scorelist.items(): - merged_scores[doc_id] += score - logging.info("Merged scores...") - - sorted_scores = list(reversed(sorted(merged_scores.items(), key = lambda i: i[1]))) + tf_idf_scores = collections.Counter() + for single_term, search_weight in single_terms.items(): + scores = collections.Counter(db.get_tf_idf_score_single(single_term, tf_idf_thresh = 1, limit = 1000, multiplier = search_weight)) + logging.info("Got %d scores for term '%s' (multiplier %d)" % (len(scores), single_term, search_weight)) + tf_idf_scores += scores + + for linked_terms, search_weight in linked_terms.items(): + scores = db.get_tf_idf_score_linked(linked_terms.split(","), tf_idf_thresh=0, multiplier=search_weight) + logging.info("Got %d scores for linked term '%s' (multiplier %d)" % (len(scores), str(linked_terms), search_weight)) + tf_idf_scores += scores + + sorted_scores = list(reversed(sorted(tf_idf_scores.items(), key = lambda i: i[1]))) toshow = 30 logging.info("Sorted scores...") + logging.info("Results:\n\n") for i, j in enumerate(sorted_scores, 0): if i >= toshow: @@ -46,7 +87,7 @@ def main(search_words): docid, score = j logging.info("%.2f - %d - %s" % (score, docid, db.get_document_name_by_id(docid))) - logging.info("%d results found in total" % len([i[1] for i in sorted_scores if i[1] > 0.1])) + logging.info("Got %d results in total. 
+    for synonym in synonyms:
+        if len(synonym.split(",")) > 1:
+            linked_terms[synonym] = 1
+        else:
+            single_terms.append(synonym)
+
+    single_terms = collections.Counter(single_terms)
+
+    logging.info("Expanded single terms to: %s" % str(single_terms))
+    logging.info("Expanded linked terms to: %s" % str(linked_terms))
+    logging.info("\n\n")
 
     with database.Database() as db:
-        tf_idf_scores = []
-        for term in search_words:
-            tf_idf_scores.append(db.get_tf_idf_score(term, tf_idf_thresh = 1, limit = 1000))
-            logging.info("Fetched %d scores for term '%s'..." % (len(tf_idf_scores[-1]), term))
-
-        merged_scores = {i: 0 for i in range(1, db.get_num_documents() + 1)}
-        for scorelist in tf_idf_scores:
-            for doc_id, score in scorelist.items():
-                merged_scores[doc_id] += score
-        logging.info("Merged scores...")
-
-        sorted_scores = list(reversed(sorted(merged_scores.items(), key = lambda i: i[1])))
+        tf_idf_scores = collections.Counter()
+        for single_term, search_weight in single_terms.items():
+            scores = collections.Counter(db.get_tf_idf_score_single(single_term, tf_idf_thresh = 1, limit = 1000, multiplier = search_weight))
+            logging.info("Got %d scores for term '%s' (multiplier %d)" % (len(scores), single_term, search_weight))
+            tf_idf_scores += scores
+
+        for linked_term, search_weight in linked_terms.items():
+            scores = collections.Counter(db.get_tf_idf_score_linked(linked_term.split(","), tf_idf_thresh = 0, multiplier = search_weight))
+            logging.info("Got %d scores for linked term '%s' (multiplier %d)" % (len(scores), linked_term, search_weight))
+            tf_idf_scores += scores
+
+        sorted_scores = sorted(tf_idf_scores.items(), key = lambda i: i[1], reverse = True)
         toshow = 30
         logging.info("Sorted scores...")
+        logging.info("Results:\n\n")
 
         for i, j in enumerate(sorted_scores, 0):
             if i >= toshow:
@@ -46,7 +87,7 @@ def main(search_words):
                 break
 
             docid, score = j
             logging.info("%.2f - %d - %s" % (score, docid, db.get_document_name_by_id(docid)))
 
-        logging.info("%d results found in total" % len([i[1] for i in sorted_scores if i[1] > 0.1]))
+        logging.info("Got %d results in total. Took %.2f minutes (%.2fs per term)" % (len(tf_idf_scores), (time.time() - starttime) / 60, (time.time() - starttime) / (len(single_terms) + len(linked_terms))))
 
 
 if __name__ == "__main__":
diff --git a/terms.py b/terms.py
index 07055dd..b9d77e2 100644
--- a/terms.py
+++ b/terms.py
@@ -41,7 +41,7 @@ def main():
             #break
 
 def parse_region(raw_text, region_weight, document_id):
-    print("d: %d; w: %d; len = %d" % (document_id, region_weight, len(raw_text)))
+    print("d: %d; w: %d; len: %d" % (document_id, region_weight, len(raw_text)), end = "")
     terms = word_tokenize(raw_text)
     terms = [re.sub(r"[^a-zA-Z0-9\s]", "", term).rstrip().lower() for term in terms]
     terms = [LEM.lemmatize(i) for i in terms if i != "" and i not in STOPWORDS]
@@ -71,6 +71,7 @@ def append_region(terms, linked_words, region_weight, document_id):
 
     weighted_terms = {i[0]:i[1] * region_weight for i in collections.Counter(terms).items()}
     weighted_linked_terms = {i[0]:i[1] * region_weight for i in collections.Counter(linked_words_ids).items()}
+    print("; t: %d; lt: %d" % (len(weighted_terms), len(weighted_linked_terms)))
 
     return weighted_terms, weighted_linked_terms
 
 def parse_document(document_id, document_path, numdocs):
diff --git a/tf_idf.py b/tf_idf.py
index 883f236..a4692f7 100644
--- a/tf_idf.py
+++ b/tf_idf.py
@@ -6,6 +6,7 @@ def main():
 
     db.build_tf_idf_table()
     db.get_tf_idf_table()
+    db.get_tf_idf_table2()
 
 def calc_log_tf(tf):
     if tf == 0:
-- 
cgit v1.2.3