diff options
| author | jwansek <eddie.atten.ea29@gmail.com> | 2023-04-28 12:03:19 +0100 | 
|---|---|---|
| committer | jwansek <eddie.atten.ea29@gmail.com> | 2023-04-28 12:03:19 +0100 | 
| commit | 43ba613b9e1ea9dbf6f94361d43418c91c8c0785 (patch) | |
| tree | 4c327162e72e49ff549bfb77e77f047f99e005b9 | |
| parent | a90ca1f2b50d343d0d2d0d3bda2ecf7712726419 (diff) | |
| download | UKGenderPayGap-43ba613b9e1ea9dbf6f94361d43418c91c8c0785.tar.gz UKGenderPayGap-43ba613b9e1ea9dbf6f94361d43418c91c8c0785.zip  | |
Finished insinuations, started on HTML and CSS
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | app.py | 14 | ||||
| -rw-r--r-- | database.py | 206 | ||||
| -rw-r--r-- | db.env.example | 1 | ||||
| -rw-r--r-- | insinuations.py | 82 | ||||
| -rw-r--r-- | parser.py | 81 | ||||
| -rw-r--r-- | requirements.txt | 4 | ||||
| -rw-r--r-- | static/scripts.js | 1 | ||||
| -rw-r--r-- | static/style.css | 42 | ||||
| -rw-r--r-- | templates/index.html.j2 | 4 | ||||
| -rw-r--r-- | templates/template.html.j2 | 44 | 
11 files changed, 481 insertions, 0 deletions
@@ -1,3 +1,5 @@ +db.env +  # Byte-compiled / optimized / DLL files  __pycache__/  *.py[cod] @@ -0,0 +1,14 @@ +import database +import flask + +app = flask.Flask(__name__) + +@app.route("/") +def serve_index(): +    return flask.render_template( +        "index.html.j2", +        title = "UK Gender Pay Gap" +    ) + +if __name__ == "__main__": +    app.run("0.0.0.0", port = 5005, debug = True)
\ No newline at end of file diff --git a/database.py b/database.py new file mode 100644 index 0000000..26d2683 --- /dev/null +++ b/database.py @@ -0,0 +1,206 @@ +from dataclasses import dataclass +import datetime +import pymysql +import os + +@dataclass +class PayGapDatabase: +     +    host: str = "db" +    user: str = "root" +    passwd: str = None +    db: str = "paygap" +    port: int = 3306 + +    def __enter__(self): +        if self.passwd is None: +            self.passwd = os.environ["MYSQL_ROOT_PASSWORD"] +             +        try: +            self.__connection = self.__get_connection() +        except Exception as e: +            print(e) +            if e.args[0] == 1049: +                self.__connection = self.__build_db() +        return self + +    def __exit__(self, type, value, traceback): +        self.__connection.close() + +    def __get_connection(self): +        return pymysql.connect( +            host = self.host, +            port = self.port, +            user = self.user, +            passwd = self.passwd, +            charset = "utf8mb4", +            database = self.db +        ) + +    def __build_db(self): +        print("Building database...") +        self.__connection = pymysql.connect( +            host = self.host, +            port = self.port, +            user = self.user, +            passwd = self.passwd, +            charset = "utf8mb4", +        ) +        with self.__connection.cursor() as cursor: +            # unsafe: +            cursor.execute("CREATE DATABASE %s" % self.db) +            cursor.execute("USE %s" % self.db) + +            cursor.execute(""" +            CREATE TABLE sic_sections( +                sic_section_id CHAR(1) NOT NULL PRIMARY KEY, +                sic_section_name VARCHAR(128) NOT NULL +            ); +            """) + +            cursor.execute(""" +            CREATE TABLE sic( +                sic_code INT UNSIGNED NOT NULL PRIMARY KEY, +                sic_description VARCHAR(512) NOT NULL, +                sic_section CHAR(1) NOT NULL, +                FOREIGN KEY (sic_section) REFERENCES sic_sections(sic_section_id) +            ); +            """) + +            cursor.execute(""" +            CREATE TABLE employer( +                company_number CHAR(8) NOT NULL PRIMARY KEY, +                name VARCHAR(512) NOT NULL, +                address TEXT NOT NULL, +                postcode VARCHAR(8) NOT NULL, +                policy_link VARCHAR(256) NULL, +                responsible_person VARCHAR(128) NOT NULL, +                size VARCHAR(20) NOT NULL, +                current_name VARCHAR(512) NULL, +                status VARCHAR(32) NULL, +                type_ VARCHAR(128) NULL, +                incorporated DATETIME NULL +            ) +            """) + +            cursor.execute(""" +            CREATE TABLE employer_sic( +                company_number CHAR(8) NOT NULL, +                sic_code INT UNSIGNED NOT NULL, +                PRIMARY KEY (company_number, sic_code), +                FOREIGN KEY (company_number) REFERENCES employer(company_number), +                FOREIGN KEY (sic_code) REFERENCES sic(sic_code) +            ); +            """) + +            cursor.execute(""" +            CREATE TABLE pay( +                pay_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, +                company_number CHAR(8) NOT NULL, +                source VARCHAR(64) NOT NULL, +                date_submitted DATETIME NOT NULL, +                DiffMeanHourlyPercent DECIMAL(8,3) NOT NULL, +                DiffMedianHourlyPercent DECIMAL(8,3) NOT NULL, +                DiffMeanBonusPercent DECIMAL(8,3) NOT NULL, +                DiffMedianBonusPercent DECIMAL(8,3) NOT NULL, +                MaleBonusPercent DECIMAL(8,3) NOT NULL, +                FemaleBonusPercent DECIMAL(8,3) NOT NULL, +                MaleLowerQuartile DECIMAL(8,3) NOT NULL, +                FemaleLowerQuartile DECIMAL(8,3) NOT NULL, +                MaleLowerMiddleQuartile DECIMAL(8,3) NOT NULL, +                FemaleLowerMiddleQuartile DECIMAL(8,3) NOT NULL, +                MaleUpperMiddleQuartile DECIMAL(8,3) NOT NULL, +                FemaleUpperMiddleQuartile DECIMAL(8,3) NOT NULL, +                MaleTopQuartile DECIMAL(8,3) NOT NULL, +                FemaleTopQuartile DECIMAL(8,3) NOT NULL, +                FOREIGN KEY (company_number) REFERENCES employer(company_number) +            ); +            """) + +            self.__connection.commit() +            return self.__connection + +    def append_sic_sections(self, section_id, description): +        # print("Section ID: '%s', Description: '%s'" % (section_id, description)) +        with self.__connection.cursor() as cursor: +            cursor.execute(""" +                INSERT INTO sic_sections VALUES (%s, %s) ON DUPLICATE KEY UPDATE sic_section_name = %s; +            """, (section_id, description, description)) +        self.__connection.commit() + +    def append_sic(self, code, description, section_id): +        print("Appended code %d (%s) under section %s" % (code, description, section_id)) +        with self.__connection.cursor() as cursor: +            cursor.execute(""" +                INSERT INTO sic VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE sic_description = %s, sic_section = %s; +            """, (code, description, section_id, description, section_id)) +        self.__connection.commit() + +    def append_employer(self, company_number, name, address, postcode, policy_link, responsible_person, size, current_name, \ +        status, type_, incorporated, sics): + +        # print("incorporated: %s" % str(incorporated)) +        # print("sics", sics) +        # print("name: %s" % name) +        with self.__connection.cursor() as cursor: +            cursor.execute(""" +            INSERT INTO employer (company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated)  +            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) +            ON DUPLICATE KEY UPDATE +            name = %s, address = %s, postcode = %s, policy_link = %s, responsible_person = %s, size = %s,  +            current_name = %s, status = %s, type_ = %s, incorporated = %s; +            """, ( +                company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated, +                name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated +            )) +            # sql = """INSERT INTO employer (company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated)  +            # VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s');""" % ( +            #     company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated +            # ) +            # print(sql) + +        self.append_employer_sics(company_number, sics) +        self.__connection.commit() + +    def append_pay_info(self, company_number, source, date_submitted, diff_mean_hourly_percent, diff_median_hourly_percent, \ +        diff_mean_bonus_percent, diff_median_bonus_percent, male_bonus_percent, female_bonus_percent, male_lower_quartile, \ +        female_lower_quartile, male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, \ +        female_upper_middle_quartile, male_top_quartile, female_top_quartile): + +        try: +            float(diff_mean_hourly_percent) +        except ValueError: +            diff_mean_hourly_percent = None + + +        with self.__connection.cursor() as cursor: +            cursor.execute("DELETE FROM pay WHERE company_number = %s AND source = %s;", (company_number, source)) + +            try: +                cursor.execute(""" +                INSERT INTO pay (company_number, source, date_submitted, DiffMeanHourlyPercent, DiffMedianHourlyPercent, +                DiffMeanBonusPercent, DiffMedianBonusPercent, MaleBonusPercent, FemaleBonusPercent, MaleLowerQuartile, +                FemaleLowerQuartile, MaleLowerMiddleQuartile, FemaleLowerMiddleQuartile, MaleUpperMiddleQuartile, +                FemaleUpperMiddleQuartile, MaleTopQuartile, FemaleTopQuartile) VALUES ( +                    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s +                ); +                """, ( +                    company_number, source, date_submitted, diff_mean_hourly_percent, diff_median_hourly_percent, +                    diff_mean_bonus_percent, diff_median_bonus_percent, male_bonus_percent, female_bonus_percent, male_lower_quartile, +                    female_lower_quartile, male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, +                    female_upper_middle_quartile, male_top_quartile, female_top_quartile +                )) +            except pymysql.err.DataError: +                return + +        self.__connection.commit() + + +    def append_employer_sics(self, company_number, sics): +        with self.__connection.cursor() as cursor: +            cursor.execute("DELETE FROM employer_sic WHERE company_number = %s", (company_number, )) + +            for sic in sics: +                cursor.execute("SELECT * FROM sic WHERE sic_code = %s", (sic, )) +                if cursor.fetchone() != None: +                    cursor.execute("INSERT INTO employer_sic VALUES (%s, %s);", (company_number, sic)) diff --git a/db.env.example b/db.env.example new file mode 100644 index 0000000..ea8bd2d --- /dev/null +++ b/db.env.example @@ -0,0 +1 @@ +MYSQL_ROOT_PASSWORD=************ diff --git a/insinuations.py b/insinuations.py new file mode 100644 index 0000000..a94607b --- /dev/null +++ b/insinuations.py @@ -0,0 +1,82 @@ +from lxml import html  +import database +import datetime +import requests +import os + +def get_sics(db: database.PayGapDatabase, url = "https://resources.companieshouse.gov.uk/sic/"): +    req = requests.get(url) +    tree = html.fromstring(req.content.decode()) +    bigtable = tree.xpath("/html/body/main/table/tbody")[0] +    for tr_elem in bigtable.getchildren(): +        td_code, td_description = tr_elem + +        if td_code.getchildren() != []: +            # if contains a <strong> element which indicates a section +            current_section_code = td_code.getchildren()[0].text.replace("Section ", "").strip() +            current_section_description = td_description.getchildren()[0].text.strip() + +            db.append_sic_sections(current_section_code, current_section_description) + +        else: +            sic_code = int(td_code.text) +            sic_desc = td_description.text.rstrip() +            db.append_sic(sic_code, sic_desc, current_section_code) + +def get_companyinfo_url(company_number, url = "https://find-and-update.company-information.service.gov.uk/company/%s"): +    if company_number.isdigit(): +        company_number = "%08d" % int(company_number) + +    return url % company_number + +def lookup_company(company_number): +    company = {} +    req = requests.get( +        get_companyinfo_url(company_number), +        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_8) AppleWebKit/534.50.2 (KHTML, like Gecko) Version/5.0.6 Safari/533.22.3'} +    ) + +    if req.status_code not in [200, 404]: +        raise ConnectionError("Couldn't connect- it %d'd. Was looking for company %s" % (req.status_code, company_number)) + +    tree = html.fromstring(req.content.decode()) + +    status_elem = tree.xpath('//*[@id="company-status"]') +    if len(status_elem) == 1: +        company["status"] = status_elem[0].text.strip() +    else: +        company["status"] = None + +    incorp_elem = tree.xpath('//*[@id="company-creation-date"]') +    if len(incorp_elem) == 1: +        company["incorporated"] = datetime.datetime.strptime(incorp_elem[0].text.strip(), "%d %B %Y") +    else: +        company["incorporated"] = None + +    type_elem = tree.xpath('//*[@id="company-type"]') +    if len(type_elem) == 1: +        company["type_"] = type_elem[0].text.strip() +    else: +        company["type_"] = None + +    company["sics"] = set() +    for i in range(9): +        sic_elem = tree.xpath('//*[@id="sic%d"]' % i) +        if len(sic_elem) == 1: +            company["sics"].add(int(sic_elem[0].text.strip().split(" - ")[0])) +        else: +            break + +    return company + +if __name__ == "__main__": +    # if not os.path.exists(".docker"): +    #     import dotenv +    #     dotenv.load_dotenv(dotenv_path = "db.env") +    #     host = "srv.home" +    # else: +    #     host = "db" + +    # with database.PayGapDatabase(host = host) as db: +    #     get_sics(db) +    print(lookup_company("02838054"))
\ No newline at end of file diff --git a/parser.py b/parser.py new file mode 100644 index 0000000..2918d73 --- /dev/null +++ b/parser.py @@ -0,0 +1,81 @@ +import insinuations +import database +import datetime +import time +import json +import csv +import sys +import os + +def parse_csv(db, csv_path): +    insinuations.get_sics(db) + +    with open(csv_path, "r") as f: +        num_lines = len(f.readlines()) +    i = 0 + +    with open(csv_path, "r") as f: +        reader = csv.reader(f) +        headers = next(reader) + +        for name, id_, address, postcode, company_id, sic_codes, diff_mean_hourly_percent, diff_median_hourly_percent, \ +            diff_mean_bonus_percent, diff_median_bonus_percent, male_bonus_percent, female_bonus_percent, male_lower_quartile, \ +            female_lower_quartile, male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, \ +            female_upper_middle_quartile, male_top_quartile, female_top_quartile, policy_link, responsible_person, size, \ +            current_name, submitted_after_deadline, duedate, submitted_date in reader: +             +            if company_id.strip() != "": +                try: +                    sic_codes = {int(i.strip()) for i in sic_codes.split(",")} +                except ValueError: +                    sic_codes = set() + +                while True: +                    try: +                        company = insinuations.lookup_company(company_id) +                    except ConnectionError as e: +                        print("Couldn't connect... Error: '%s'... Waiting 20 seconds..." % str(e)) +                        time.sleep(20) +                    else: +                        break + +                company["sics"] = company["sics"].union(sic_codes) +                company["company_number"] = company_id +                company["name"] = name +                company["address"] = address +                company["postcode"] = postcode +                company["policy_link"] = policy_link +                company["responsible_person"] = responsible_person +                company["size"] = size +                company["current_name"] = current_name + +                print("%.2f%%" % ((i / num_lines) * 100), company) +                print( +                    company_id, os.path.basename(csv_path), datetime.datetime.strptime(submitted_date, "%Y/%m/%d %H:%M:%S"),  +                    diff_mean_hourly_percent, diff_median_hourly_percent, diff_mean_bonus_percent, diff_median_bonus_percent,  +                    male_bonus_percent, female_bonus_percent, male_lower_quartile, female_lower_quartile,  +                    male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile,  +                    female_upper_middle_quartile, male_top_quartile, female_top_quartile +                ) +                db.append_employer(**company) +                db.append_pay_info( +                    company_id, os.path.basename(csv_path), datetime.datetime.strptime(submitted_date, "%Y/%m/%d %H:%M:%S"),  +                    diff_mean_hourly_percent, diff_median_hourly_percent, diff_mean_bonus_percent, diff_median_bonus_percent,  +                    male_bonus_percent, female_bonus_percent, male_lower_quartile, female_lower_quartile,  +                    male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile,  +                    female_upper_middle_quartile, male_top_quartile, female_top_quartile +                ) +                i += 1 + +                # break + +if __name__ == "__main__": +    if not os.path.exists(".docker"): +        import dotenv +        dotenv.load_dotenv(dotenv_path = "db.env") +        host = "srv.home" +    else: +        host = "db" + +    with database.PayGapDatabase(host = host) as db: +        parse_csv(db, sys.argv[1])
\ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2eff776 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +pymysql +lxml +python-dotenv +flask diff --git a/static/scripts.js b/static/scripts.js new file mode 100644 index 0000000..c6c3320 --- /dev/null +++ b/static/scripts.js @@ -0,0 +1 @@ +console.log("foo!")
\ No newline at end of file diff --git a/static/style.css b/static/style.css new file mode 100644 index 0000000..be2f299 --- /dev/null +++ b/static/style.css @@ -0,0 +1,42 @@ +body { +    font-family: Helvetica, sans-serif; +} + +header { +    font-size: large; +    padding-left: 1%; +    /* font-weight: bold; */ +} + +header p { +    font-size: small; +} + +a { +    color: black; +    font-weight: bold; +    padding-top: 1px; +} + +aside { +    width: 15%; +    padding-left: 15px; +    margin-left: 15px; +    float: right; +    /* border-left-color: rgb(189, 189, 189); +    border-left-width: 2px; +    border-left-style: groove; */ +} + +#main_content { +    padding-left: 2.5%; +    padding-right: 2.5; +} + +footer { +    padding-left: 10%; +    padding-right: 10%; +    padding-top: 50px; +    font-size: xx-small; +    justify-content: center; +}
\ No newline at end of file diff --git a/templates/index.html.j2 b/templates/index.html.j2 new file mode 100644 index 0000000..91f314e --- /dev/null +++ b/templates/index.html.j2 @@ -0,0 +1,4 @@ +{% extends "template.html.j2" %} +{% block content %} +    <p>foo</p> +{% endblock %}
\ No newline at end of file diff --git a/templates/template.html.j2 b/templates/template.html.j2 new file mode 100644 index 0000000..1fb238d --- /dev/null +++ b/templates/template.html.j2 @@ -0,0 +1,44 @@ +<!DOCTYPE HTML> +<html> +<head> +	<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> +	<title>UK Gender Pay Gap :: {{ title }}</title> +	<!-- JQuery and JQurty UI current version --> +    <script src='https://code.jquery.com/jquery-3.6.0.js'></script> +	<script src="https://code.jquery.com/ui/1.13.1/jquery-ui.js"></script> + +	<!-- Highcharts libraries --> +	<script type="text/javascript" src="https://code.highcharts.com/highcharts.js"></script> +	<script type="text/javascript" src="https://code.highcharts.com/highcharts-more.js"></script> +	<script type="text/javascript" src="https://code.highcharts.com/modules/exporting.js"></script> +	<script type="text/javascript" src="https://code.highcharts.com/modules/export-data.js"></script> +	<script type="text/javascript" src="https://code.highcharts.com/modules/accessibility.js"></script>	 +	 +	<script type="text/javascript" src="https://code.highcharts.com/maps/modules/map.js"></script> +	<script type="text/javascript" src="https://code.highcharts.com/mapdata/custom/world-robinson.js"></script> + +	<!-- local Javascript files --> +	<script type="text/javascript" src="{{ url_for('static', filename='scripts.js') }}"></script> +	<!-- remote and local CSS stylesheet --> +	<link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}"> +</head> + +<body> +    <header> +        <h1>{{ title }}</h1> +        <p>Data provided by the <a href="https://gender-pay-gap.service.gov.uk/">UK Government</a></p> +    </header> + +    <aside> +        <h4>Filter...</h4> +    </aside> + +    <div id="main_content"> +        {% block content %} +        {% endblock %} +    </div> + +    <footer> +        <p>Source code released under GPLv3</p> +    </footer> +</body>
\ No newline at end of file  | 
