| author | jwansek <eddie.atten.ea29@gmail.com> | 2021-11-25 15:55:59 +0000 |
| --- | --- | --- |
| committer | jwansek <eddie.atten.ea29@gmail.com> | 2021-11-25 15:55:59 +0000 |
| commit | 95563cbf5c0d66016892e8d580033088f865010b (patch) | |
| tree | 2abd09151ee57913d375af992d151af182f42a7d | |
| parent | 5e4aee62e7a746491edd9077c0bdd959c444960b (diff) | |
| download | searchEngine-95563cbf5c0d66016892e8d580033088f865010b.tar.gz searchEngine-95563cbf5c0d66016892e8d580033088f865010b.zip | |
added linked terms, weighting depending on location
| -rw-r--r-- | database.py | 81 |
| -rw-r--r-- | search.py | 7 |
| -rw-r--r-- | terms.py | 114 |
3 files changed, 161 insertions, 41 deletions
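In summary: the flat `term_frequency` table becomes `term_weights`, and each region of a page (file name, headers, link texts, body text) now contributes its term counts scaled by a per-region weight. Below is a minimal sketch of that accumulation step; only the region weights are taken from terms.py in this commit, while the helper and the sample tokens are hypothetical.

```python
import collections

# Region weights as used in terms.py below: file name 100, headers 50,
# link texts 10, body text 1.
REGION_WEIGHTS = {"filename": 100, "headers": 50, "links": 10, "body": 1}

def weigh_region(tokens, region):
    # Scale each term's raw count in this region by the region's weight.
    counts = collections.Counter(tokens)
    return {term: n * REGION_WEIGHTS[region] for term, n in counts.items()}

# Accumulate the weighted counts of every region into one Counter per
# document; Counter's in-place add accepts plain dicts, which is what
# terms.py relies on when it does `weighted_terms += region_weighted_terms`.
weighted_terms = collections.Counter()
weighted_terms += weigh_region(["tea", "plant"], "filename")
weighted_terms += weigh_region(["tea", "history", "tea"], "body")
print(weighted_terms)  # tea: 102, plant: 100, history: 1
```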
diff --git a/database.py b/database.py
index 875d22e..5c326b4 100644
--- a/database.py
+++ b/database.py
@@ -45,14 +45,23 @@ class Database:
             `vocabulary_id` INTEGER PRIMARY KEY,
             `term` TEXT NOT NULL
         );""")
+        # \/ VERY UGLY, NOT HOW THIS SHOULD BE DONE
         cursor.execute("""
-        CREATE TABLE IF NOT EXISTS `term_frequency` (
-            `vocabulary_id` INT UNSIGNED NOT NULL,
+        CREATE TABLE IF NOT EXISTS `linked_terms` (
+            `linked_term_id` INTEGER PRIMARY KEY,
+            `chain` TEXT NOT NULL
+        );""")
+        cursor.execute("CREATE UNIQUE INDEX unique_chain on linked_terms (chain);")
+        cursor.execute("""
+        CREATE TABLE IF NOT EXISTS `term_weights` (
+            `term_weight_id` INTEGER PRIMARY KEY,
+            `vocabulary_id` INT UNSIGNED NULL,
+            `linked_term_id` INT UNSIGNED NULL,
             `document_id` INT UNSIGNED NOT NULL,
-            `frequency` INT UNSIGNED NOT NULL,
-            FOREIGN KEY (`vocabulary_id`) REFERENCES `vocabulary`(`vocabulary_id`),
-            FOREIGN KEY (`document_id`) REFERENCES `documents`(`document_id`),
-            PRIMARY KEY(`vocabulary_id`, `document_id`)
+            `weight` INT UNSIGNED NOT NULL,
+            FOREIGN KEY (`vocabulary_id`) REFERENCES `vocabulary`(`vocabulary_id`),
+            FOREIGN KEY (`linked_term_id`) REFERENCES `linked_terms`(`linked_term_id`),
+            FOREIGN KEY (`document_id`) REFERENCES `documents`(`document_id`)
         );""")
         cursor.execute("CREATE UNIQUE INDEX unique_terms on vocabulary (term);")
@@ -79,12 +88,38 @@ class Database:
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
             cursor.executemany("INSERT OR IGNORE INTO vocabulary(term) VALUES (?);", [(term, ) for term in terms])
 
-    def append_terms_in_document(self, document_id, counter):
+    def get_vocabulary_ids(self, terms):
+        # can't do executemany with select statements :(
+        out = {}
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            for term in terms:
+                cursor.execute("SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?;", (term, ))
+                out[term] = cursor.fetchone()[0]
+        return out
+
+    def append_merged_terms(self, merged_terms):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.executemany("INSERT OR IGNORE INTO `linked_terms`(`chain`) VALUES (?);", [(i, ) for i in merged_terms])
+
+    def append_document_term_weights(self, terms, document_id):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.executemany("""
+            INSERT INTO `term_weights`
+                (`vocabulary_id`, `linked_term_id`, `document_id`, `weight`)
+            VALUES ((
+                SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?
+            ), NULL, ?, ?);
+            """, [(i[0], document_id, i[1]) for i in terms.items()])
+
+    def append_document_linked_term_weights(self, linked_terms, document_id):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
             cursor.executemany("""
-            INSERT INTO `term_frequency`(`vocabulary_id`, `document_id`, `frequency`)
-            VALUES ((SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?), ?, ?)
-            """, [(i[0], document_id, i[1]) for i in counter.items()])
+            INSERT INTO `term_weights`
+                (`vocabulary_id`, `linked_term_id`, `document_id`, `weight`)
+            VALUES (NULL, (
+                SELECT `linked_term_id` FROM `linked_terms` WHERE `chain` = ?
+            ), ?, ?);
+            """, [(i[0], document_id, i[1]) for i in linked_terms.items()])
 
     def build_tf_idf_table(self):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
@@ -96,27 +131,27 @@ class Database:
             CREATE VIEW IF NOT EXISTS `vocab_count` AS
             SELECT vocabulary_id,
                 COUNT(vocabulary_id) AS vocabulary_count
-            FROM term_frequency
+            FROM term_weights
             GROUP BY vocabulary_id;
             """)
             cursor.execute("""
             CREATE VIEW IF NOT EXISTS `tf_idf` AS SELECT
-                `term_frequency`.`vocabulary_id` AS `vocabulary_id`,
+                `term_weights`.`vocabulary_id` AS `vocabulary_id`,
                 `document_id`,
-                `term_frequency`.`frequency`,
-                LOG_TF(`frequency`) AS tf,
+                `term_weights`.`weight`,
+                LOG_TF(`weight`) AS tf,
                 (SELECT COUNT(`document_id`) FROM `documents`) AS n,
                 `vocab_count`.`vocabulary_count` AS df,
                 (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS idf,
-                LOG_TF(`frequency`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS tf_idf
-            FROM `term_frequency`
+                LOG_TF(`weight`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS tf_idf
+            FROM `term_weights`
             INNER JOIN `vocab_count`
-                ON `vocab_count`.`vocabulary_id` = `term_frequency`.`vocabulary_id`
+                ON `vocab_count`.`vocabulary_id` = `term_weights`.`vocabulary_id`
             ;""")
 
     def get_term_frequencies(self):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
-            cursor.execute("SELECT * FROM `term_frequency`;")
+            cursor.execute("SELECT * FROM `term_weights`;")
             return cursor.fetchall()
 
     def append_tf_idf_table(self, tfs):
@@ -126,8 +161,8 @@ class Database:
             INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
             VALUES (
                 ?, ?, ?,
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?),
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?)
+                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?),
+                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?) * ?)
             """, [(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs])
 
     def append_tf_idf_table_single(self, vocabulary_id, document_id, tf):
@@ -136,8 +171,8 @@ class Database:
             INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
             VALUES (
                 ?, ?, ?,
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?),
-                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?)
+                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?),
+                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?) * ?)
             """, (vocabulary_id, document_id, tf, vocabulary_id, vocabulary_id, tf))
 
     def test_log(self, to_log):
@@ -155,7 +190,7 @@ class Database:
             cursor.execute("SELECT * FROM `tf_idf` LIMIT 100;")
             out = cursor.fetchall()
             print(len(out))
-            print(("vocabulary_id", "document_id", "frequency", "tf", "n", "df", "idf"))
+            print(("vocabulary_id", "document_id", "weight", "tf", "n", "df", "idf"))
             for l in out[:100]:
                 print(l)
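The new `term_weights` rows reference either a plain vocabulary term or a linked-term chain, with the unused foreign key left NULL, which is the arrangement the in-code comment flags as ugly. Here is a minimal, self-contained sketch of how such rows read back, using an in-memory SQLite database; the table and column names follow the diff above, while the sample data and the query are illustrative only.

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.executescript("""
    CREATE TABLE vocabulary (vocabulary_id INTEGER PRIMARY KEY, term TEXT NOT NULL);
    CREATE TABLE linked_terms (linked_term_id INTEGER PRIMARY KEY, chain TEXT NOT NULL);
    CREATE TABLE term_weights (
        term_weight_id INTEGER PRIMARY KEY,
        vocabulary_id INT NULL,   -- set for a plain term...
        linked_term_id INT NULL,  -- ...or this one for an entity chain, never both
        document_id INT NOT NULL,
        weight INT NOT NULL
    );
    INSERT INTO vocabulary (term) VALUES ('tea');
    INSERT INTO linked_terms (chain) VALUES ('1,2');
    INSERT INTO term_weights (vocabulary_id, linked_term_id, document_id, weight)
        VALUES (1, NULL, 42, 100), (NULL, 1, 42, 50);
""")

# LEFT JOINs resolve whichever id is present; the other column comes back None.
for row in con.execute("""
    SELECT tw.document_id, v.term, lt.chain, tw.weight
    FROM term_weights tw
    LEFT JOIN vocabulary v ON v.vocabulary_id = tw.vocabulary_id
    LEFT JOIN linked_terms lt ON lt.linked_term_id = tw.linked_term_id;"""):
    print(row)  # (42, 'tea', None, 100) then (42, None, '1,2', 50)
```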
diff --git a/search.py b/search.py
--- a/search.py
+++ b/search.py
@@ -36,8 +36,8 @@ def main(search_words):
     logging.info("Merged scores...")
     sorted_scores = list(reversed(sorted(merged_scores.items(), key = lambda i: i[1])))
 
-    toshow = 20
-    print("Sorted scores...")
+    toshow = 30
+    logging.info("Sorted scores...")
 
     for i, j in enumerate(sorted_scores, 0):
         if i >= toshow:
@@ -45,6 +45,9 @@ def main(search_words):
         docid, score = j
         logging.info("%.2f - %d - %s" % (score, docid, db.get_document_name_by_id(docid)))
+
+    logging.info("%d results found in total" % len([i[1] for i in sorted_scores if i[1] > 0.1]))
+
 
 if __name__ == "__main__":
     main(sys.argv[1:])
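For reference, `sorted_scores` above is in descending score order, and the new log line counts only documents scoring above 0.1. A toy sketch of both steps follows; the scores are made up, and `sorted(..., reverse=True)` matches the `list(reversed(sorted(...)))` idiom in search.py up to the ordering of ties.

```python
import logging
logging.basicConfig(level=logging.INFO)

merged_scores = {3: 0.92, 7: 0.05, 1: 0.40}  # docid -> tf-idf score, toy values

# Descending by score: [(3, 0.92), (1, 0.40), (7, 0.05)]
sorted_scores = sorted(merged_scores.items(), key=lambda i: i[1], reverse=True)

# Only scores above the 0.1 threshold count as results found
logging.info("%d results found in total" % len([s for _, s in sorted_scores if s > 0.1]))
```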
diff --git a/terms.py b/terms.py
--- a/terms.py
+++ b/terms.py
@@ -2,13 +2,24 @@
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import PorterStemmer
 from nltk.stem import WordNetLemmatizer
-from nltk.util import ngrams
-import collections
+from nltk.tokenize import word_tokenize
+from nltk.util import ngrams
+from nltk import pos_tag
+import collections
+import itertools
 import documents
 import database
+import random
+import nltk
+import time
 import bs4
+import os
 import re
+import spacy
+from spacy import displacy
+nlp = spacy.load("en_core_web_sm")
+
 STOPWORDS = set(stopwords.words('english')).difference({
     "how", "because", "through", "or", "as", "about", "not",
     "no", "who", "of", "can", "over", "you"
@@ -17,31 +28,102 @@ LEM = WordNetLemmatizer()
 
 def main():
     numdocs = documents.get_num_documents()
-    for document_id in range(1, numdocs):
-        parse_document(document_id, documents.get_document_name_by_id(document_id), numdocs)
+    docid = random.randint(1, numdocs)
+    parse_document(docid, documents.get_document_name_by_id(docid), numdocs)
+
+    # for document_id in range(1, numdocs):
+    #     parse_document(document_id, documents.get_document_name_by_id(document_id), numdocs)
+
+    #     # break
+
+def parse_region(raw_text, region_weight, document_id):
+    terms = word_tokenize(raw_text)
+    terms = [re.sub(r"[^a-zA-Z0-9\s]", "", term).rstrip().lower() for term in terms]
+    terms = [LEM.lemmatize(i) for i in terms if i != "" and i not in STOPWORDS]
+
+    processed = nlp(raw_text)
+    linked_words = []
+    for ent in processed.ents:
+        words = [
+            re.sub(r"[^a-zA-Z0-9\s]", "", word).rstrip().lower()
+            for word in word_tokenize(ent.text)
+            if re.sub(r"[^a-zA-Z0-9\s]", "", word).rstrip().lower() != ""
+        ]
+        if len(words) > 1:
+            linked_words.append(words)
+
+    return append_region(terms, linked_words, region_weight, document_id)
+
+def append_region(terms, linked_words, region_weight, document_id):
+    flattened_linked_words = set(itertools.chain.from_iterable(linked_words))
+    with database.Database() as db:
+        db.append_terms(flattened_linked_words.union(set(terms)))
+        ids = db.get_vocabulary_ids(flattened_linked_words)
+
+        linked_words_ids = [str([ids[j] for j in i])[1:-1].replace(" ", "") for i in linked_words]
+        db.append_merged_terms(linked_words_ids)
+
+    weighted_terms = {i[0]: i[1] * region_weight for i in collections.Counter(terms).items()}
+    weighted_linked_terms = {i[0]: i[1] * region_weight for i in collections.Counter(linked_words_ids).items()}
+
+    return weighted_terms, weighted_linked_terms
 
 def parse_document(document_id, document_path, numdocs):
+    starttime = time.time()
     with open(document_path, "r") as f:
         soup = bs4.BeautifulSoup(f.read(), "lxml")
 
+    weighted_terms = collections.Counter()
+    weighted_linked_terms = collections.Counter()
+
+    # parse the file name, it has a weight of 100
+    filenametext = os.path.splitext(os.path.split(document_path)[-1])[0]
+    region_weighted_terms, region_linked_terms = parse_region(filenametext, 100, document_id)
+    weighted_terms += region_weighted_terms
+    weighted_linked_terms += region_linked_terms
+
+    # parse the main text, it has a weight of 1
     text = [e.text for e in soup.find("div", {"class": "mw-parser-output"}).findChildren(recursive = True)]
-    text = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in text]
-
-    terms = []
-    for i in text:
-        terms += re.split(r"\s+|\n", i)
+    region_weighted_terms, region_linked_terms = parse_region(" ".join(text), 1, document_id)
+    weighted_terms += region_weighted_terms
+    weighted_linked_terms += region_linked_terms
+
+    # parse html headers
+    elemtexts = []
+    try:
+        elemtexts += [e.text for e in soup.h1.findChildren(recursive = True)]
+    except AttributeError:
+        pass
-    terms = [LEM.lemmatize(i) for i in terms if i != "" and i not in STOPWORDS]
-    terms_counted = collections.Counter(terms)
+    try:
+        elemtexts += [e.text for e in soup.h2.findChildren(recursive = True)]
+    except AttributeError:
+        pass
+
+    region_weighted_terms, region_linked_terms = parse_region(re.sub(r"edit|Contents|source", "", " ".join(elemtexts)), 50, document_id)
+    weighted_terms += region_weighted_terms
+    weighted_linked_terms += region_linked_terms
+
+    # parse html link elements texts, has a weight of 10
+    a_texts = [e.text for e in soup.select("a") if e.text != "" and e.text != "edit" and e.text != "edit source"]
+    region_weighted_terms, region_linked_terms = parse_region(" ".join(a_texts), 10, document_id)
+    weighted_terms += region_weighted_terms
+    weighted_linked_terms += region_linked_terms
 
     with database.Database() as db:
-        db.append_terms(terms)
-        print("[%f%%] Added %d terms from docid: %d" % ((document_id/numdocs)*100, len(terms_counted), document_id))
+        db.append_document_term_weights(weighted_terms, document_id)
+        db.append_document_linked_term_weights(weighted_linked_terms, document_id)
+
+        print("[%.3f%%] {%.1fs} %d terms (weight %d), %d linked terms (weight %d) - %s" % (
+            (document_id/numdocs)*100,
+            time.time() - starttime,
+            len(weighted_terms),
+            sum(weighted_terms.values()),
+            len(weighted_linked_terms),
+            sum(weighted_linked_terms.values()),
+            document_path
+        ))
 
-        db.append_terms_in_document(document_id, terms_counted)
-        print("Appended term frequency too")
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
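A note on the chains: `append_region` above turns each multi-word spaCy entity into a comma-separated string of vocabulary ids via `str(list)[1:-1].replace(" ", "")`, and that string is what `linked_terms.chain` stores. A standalone sketch of the encoding, with hypothetical ids standing in for `db.get_vocabulary_ids()`; a plain `join` produces the same string.

```python
# Hypothetical vocabulary ids standing in for db.get_vocabulary_ids()
ids = {"new": 12, "york": 57}
linked_words = [["new", "york"]]  # one multi-word entity from spaCy

# The encoding used in append_region: strip the brackets off the list repr
chain = str([ids[w] for w in linked_words[0]])[1:-1].replace(" ", "")
print(chain)  # 12,57

# Equivalent and more direct:
assert chain == ",".join(str(ids[w]) for w in linked_words[0])
```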