diff options
author | jwansek <eddie.atten.ea29@gmail.com> | 2023-05-16 14:11:51 +0100 |
---|---|---|
committer | jwansek <eddie.atten.ea29@gmail.com> | 2023-05-16 14:11:51 +0100 |
commit | 953dc5ef8652a0b7e8ae2c7db3535f1bd157cdd4 (patch) | |
tree | 08a4c9cd4f10a12927f7c57056617672369ed9b6 /src/parser.py | |
parent | 0faf95d56815d310290d3533d81d888deb7731f0 (diff) | |
download | UKGenderPayGap-953dc5ef8652a0b7e8ae2c7db3535f1bd157cdd4.tar.gz UKGenderPayGap-953dc5ef8652a0b7e8ae2c7db3535f1bd157cdd4.zip |
Added alt text, docker
Diffstat (limited to 'src/parser.py')
-rw-r--r-- | src/parser.py | 85 |
1 files changed, 85 insertions, 0 deletions
diff --git a/src/parser.py b/src/parser.py new file mode 100644 index 0000000..f9e3e81 --- /dev/null +++ b/src/parser.py @@ -0,0 +1,85 @@ +import insinuations +import database +import datetime +import time +import json +import csv +import sys +import os + +def parse_csv(db, csv_path): + insinuations.get_sics(db) + + with open(csv_path, "r") as f: + num_lines = len(f.readlines()) + i = 0 + + with open(csv_path, "r") as f: + reader = csv.reader(f) + headers = next(reader) + + for name, id_, address, postcode, company_id, sic_codes, diff_mean_hourly_percent, diff_median_hourly_percent, \ + diff_mean_bonus_percent, diff_median_bonus_percent, male_bonus_percent, female_bonus_percent, male_lower_quartile, \ + female_lower_quartile, male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, \ + female_upper_middle_quartile, male_top_quartile, female_top_quartile, policy_link, responsible_person, size, \ + current_name, submitted_after_deadline, duedate, submitted_date in reader: + + if company_id.strip() != "": + try: + sic_codes = {int(i.strip()) for i in sic_codes.split(",")} + except ValueError: + sic_codes = set() + + while True: + try: + company = insinuations.lookup_company(company_id) + except ConnectionError as e: + print("Couldn't connect... Error: '%s'... Waiting 20 seconds..." % str(e)) + time.sleep(20) + else: + break + + company["sics"] = company["sics"].union(sic_codes) + company["company_number"] = company_id + company["name"] = name + company["address"] = address + company["postcode"] = postcode + company["policy_link"] = policy_link + company["responsible_person"] = responsible_person + company["size"] = size + company["current_name"] = current_name + + print("%.2f%%" % ((i / num_lines) * 100), company) + print( + company_id, os.path.basename(csv_path), datetime.datetime.strptime(submitted_date, "%Y/%m/%d %H:%M:%S"), + diff_mean_hourly_percent, diff_median_hourly_percent, diff_mean_bonus_percent, diff_median_bonus_percent, + male_bonus_percent, female_bonus_percent, male_lower_quartile, female_lower_quartile, + male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, + female_upper_middle_quartile, male_top_quartile, female_top_quartile + ) + db.append_employer(**company) + db.append_pay_info( + company_id, os.path.basename(csv_path), datetime.datetime.strptime(submitted_date, "%Y/%m/%d %H:%M:%S"), + diff_mean_hourly_percent, diff_median_hourly_percent, diff_mean_bonus_percent, diff_median_bonus_percent, + male_bonus_percent, female_bonus_percent, male_lower_quartile, female_lower_quartile, + male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, + female_upper_middle_quartile, male_top_quartile, female_top_quartile + ) + i += 1 + + # break + +if __name__ == "__main__": + if not os.path.exists(".docker"): + import dotenv + dotenv.load_dotenv(dotenv_path = "db.env") + host = "srv.home" + else: + host = "db" + + with database.PayGapDatabase(host = host) as db: + p = sys.argv[1] + if os.path.basename(p) == "National_Statistics_Postcode_Lookup_UK.csv": + db.append_counties(p) + else: + parse_csv(db, p)
\ No newline at end of file |