author     jwansek <eddie.atten.ea29@gmail.com>  2021-11-26 17:57:07 +0000
committer  jwansek <eddie.atten.ea29@gmail.com>  2021-11-26 17:57:07 +0000
commit     1f5dec8047af8c58ce3acb5014d82caf7e6766df (patch)
tree       5d54f191befb210c733f7a5a85de2906c79509f0
parent     fd2b9c85377df274514c6f0542cd6d1dbcbab183 (diff)
download   searchEngine-1f5dec8047af8c58ce3acb5014d82caf7e6766df.tar.gz
           searchEngine-1f5dec8047af8c58ce3acb5014d82caf7e6766df.zip
split large texts up into more manageable chunks
-rw-r--r--  database.py  10
-rw-r--r--  terms.py     15
2 files changed, 18 insertions(+), 7 deletions(-)
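
The change itself is small: the main article text is cut into fixed-size slices before tokenising, so parse_region never receives an unmanageably long string. A minimal sketch of that idea (the helper name chunk_text is illustrative and not part of this commit; the 99,999-character size mirrors the terms.py hunk below):

    def chunk_text(text, size = 99999):
        # yield consecutive fixed-size slices of a long string so each
        # chunk can be tokenised and weighted independently
        for i in range(0, len(text), size):
            yield text[i:i + size]

The commit does the same slicing with a list comprehension rather than a generator, which is equivalent here.
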
diff --git a/database.py b/database.py
index 5c326b4..8fc3584 100644
--- a/database.py
+++ b/database.py
@@ -84,6 +84,11 @@ class Database:
             cursor.execute("SELECT COUNT(*) FROM documents;")
             return cursor.fetchone()[0]
 
+    def get_max_linked_terms(self):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.execute("SELECT MAX(`document_id`) + 2 FROM term_weights;")
+            return cursor.fetchone()[0]
+
     def append_terms(self, terms):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
             cursor.executemany("INSERT OR IGNORE INTO vocabulary(term) VALUES (?);", [(term, ) for term in terms])
@@ -211,5 +216,6 @@ if __name__ == "__main__":
         # print(db.test_log(100))
         # print(db.test_log(21))
         # db.get_tf_idf_table()
-        for i, v in db.get_tf_idf_score("enzyme", 1).items():
-            print(i, v)
\ No newline at end of file
+        #for i, v in db.get_tf_idf_score("enzyme", 1).items():
+        #    print(i, v)
+        print(db.get_max_linked_terms())
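
The new get_max_linked_terms() reads MAX(`document_id`) + 2 from term_weights, so terms.py (below) can restart indexing near the last document that already has weighted terms instead of always beginning at document 1. A hedged usage sketch, assuming the Database context manager shown above:

    import database

    with database.Database() as db:
        # resume roughly where the previous run stopped, as terms.py does below
        startat = db.get_max_linked_terms() - 1
    print("resuming indexing from document", startat)
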
diff --git a/terms.py b/terms.py
index 74eafd9..07055dd 100644
--- a/terms.py
+++ b/terms.py
@@ -30,15 +30,18 @@ LEM = WordNetLemmatizer()
 
 def main():
     numdocs = documents.get_num_documents()
+    with database.Database() as db:
+        startat = db.get_max_linked_terms() - 1
     #docid = random.randint(1, numdocs)
     #parse_document(docid, documents.get_document_name_by_id(docid), numdocs)
-    for document_id in range(1, numdocs):
+    for document_id in range(startat, numdocs):
         parse_document(document_id, documents.get_document_name_by_id(document_id), numdocs)
         #break
 
 def parse_region(raw_text, region_weight, document_id):
+    print("d: %d; w: %d; len = %d" % (document_id, region_weight, len(raw_text)))
     terms = word_tokenize(raw_text)
     terms = [re.sub(r"[^a-zA-Z0-9\s]", "", term).rstrip().lower() for term in terms]
     terms = [LEM.lemmatize(i) for i in terms if i != "" and i not in STOPWORDS]
@@ -85,10 +88,12 @@ def parse_document(document_id, document_path, numdocs):
         weighted_linked_terms += region_linked_terms
 
     # parse the main text, it has a weight of 1
-    text = [e.text for e in soup.find("div", {"class": "mw-parser-output"}).findChildren(recursive = True)]
-    region_weighted_terms, region_linked_terms = parse_region(" ".join(text), 1, document_id)
-    weighted_terms += region_weighted_terms
-    weighted_linked_terms += region_linked_terms
+    text = " ".join([e.text for e in soup.find("div", {"class": "mw-parser-output"}).findChildren(recursive = True)])
+    # split large texts into more manageable chunks
+    for splittext in [text[i:i+99999] for i in range(0, len(text), 99999)]:
+        region_weighted_terms, region_linked_terms = parse_region(splittext, 1, document_id)
+        weighted_terms += region_weighted_terms
+        weighted_linked_terms += region_linked_terms
 
     # parse html headers
     elemtexts = []