diff options
Diffstat (limited to 'search.py')
-rw-r--r-- | search.py | 68 |
1 files changed, 50 insertions, 18 deletions
@@ -1,18 +1,50 @@ -import terms -import sys -import re - -def main(search_words): - - txt = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in search_words] - - search_words = [] - for i in txt: - search_words += re.split(r"\s+|\n", i) - - search_words = [terms.LEM.lemmatize(i) for i in search_words if i != "" and i not in terms.STOPWORDS] - - print(search_words) - -if __name__ == "__main__": - main(sys.argv[1:])
\ No newline at end of file +import database
+import logging
+import terms
+import sys
+import re
+
# Root logger setup: INFO and above, timestamp-prefixed, written both to
# searches.log and to a StreamHandler (which defaults to stderr).
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s]\t%(message)s",
    handlers=[logging.FileHandler("searches.log"), logging.StreamHandler()],
)
+
def main(search_words, max_results=20):
    """Rank indexed documents against a free-text query and log the best hits.

    Every raw argument is reduced to ASCII letters and whitespace,
    lower-cased and split into words.  Words found in ``terms.STOPWORDS``
    are dropped and the rest are lemmatized with ``terms.LEM``.  For each
    remaining term the per-document scores returned by
    ``Database.get_tf_idf_score`` are summed per document id, and the
    highest-scoring documents are written to the log as
    ``score - id - name``.

    Args:
        search_words: Iterable of raw query strings, e.g. ``sys.argv[1:]``.
            A single string may contain several whitespace-separated words.
        max_results: Maximum number of ranked documents to log. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        None. All output goes through ``logging``.
    """
    query_terms = []
    for raw in search_words:
        # Drop everything except ASCII letters and whitespace before splitting.
        cleaned = re.sub(r"[^a-zA-Z\s]", "", raw).lower()
        # str.split() with no argument splits on any whitespace run and never
        # yields empty strings, so no separate "" filter is needed.
        query_terms.extend(
            terms.LEM.lemmatize(word)
            for word in cleaned.split()
            if word not in terms.STOPWORDS
        )

    if not query_terms:
        # Without terms every document would score 0 and the "top" list would
        # be arbitrary, so stop before opening the database.
        logging.info("No searchable terms left in the query; nothing to search.")
        return

    logging.info("Started searching. Using terms: %s", " ".join(query_terms))

    with database.Database() as db:
        per_term_scores = []
        for term in query_terms:
            scores = db.get_tf_idf_score(term, tf_idf_thresh=1, limit=1000)
            per_term_scores.append(scores)
            logging.info("Fetched %d scores for term '%s'...", len(scores), term)

        # Seed ids 1..N with 0 so documents that match no term are still ranked.
        # NOTE(review): assumes document ids are the contiguous range 1..N - confirm.
        merged_scores = dict.fromkeys(range(1, db.get_num_documents() + 1), 0)
        for scores in per_term_scores:
            for doc_id, score in scores.items():
                # .get() tolerates ids outside the seeded range, which would
                # otherwise raise KeyError on the in-place addition.
                merged_scores[doc_id] = merged_scores.get(doc_id, 0) + score
        logging.info("Merged scores...")

        # Stable ascending sort followed by a reversal: this keeps the same
        # ordering among equal scores as the original reversed(sorted(...)),
        # which sorted(..., reverse=True) would not.
        ranked = sorted(merged_scores.items(), key=lambda item: item[1])
        ranked.reverse()
        # Logged (not printed) so it lands in searches.log like the other
        # progress messages.
        logging.info("Sorted scores...")

        # max(..., 0) keeps a negative limit from turning into a tail slice.
        for doc_id, score in ranked[:max(max_results, 0)]:
            logging.info(
                "%.2f - %d - %s",
                score,
                doc_id,
                db.get_document_name_by_id(doc_id),
            )
+
if __name__ == "__main__":
    # Everything after the script name is handed to main() as raw query text.
    query_args = sys.argv[1:]
    main(query_args)
|