aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjwansek <eddie.atten.ea29@gmail.com>2023-04-28 12:03:19 +0100
committerjwansek <eddie.atten.ea29@gmail.com>2023-04-28 12:03:19 +0100
commit43ba613b9e1ea9dbf6f94361d43418c91c8c0785 (patch)
tree4c327162e72e49ff549bfb77e77f047f99e005b9
parenta90ca1f2b50d343d0d2d0d3bda2ecf7712726419 (diff)
downloadUKGenderPayGap-43ba613b9e1ea9dbf6f94361d43418c91c8c0785.tar.gz
UKGenderPayGap-43ba613b9e1ea9dbf6f94361d43418c91c8c0785.zip
Finished insinuations, started on HTML and CSS
-rw-r--r--.gitignore2
-rw-r--r--app.py14
-rw-r--r--database.py206
-rw-r--r--db.env.example1
-rw-r--r--insinuations.py82
-rw-r--r--parser.py81
-rw-r--r--requirements.txt4
-rw-r--r--static/scripts.js1
-rw-r--r--static/style.css42
-rw-r--r--templates/index.html.j24
-rw-r--r--templates/template.html.j244
11 files changed, 481 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index b6e4761..fc50f43 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+db.env
+
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..ecb4426
--- /dev/null
+++ b/app.py
@@ -0,0 +1,14 @@
+import database
+import flask
+
+app = flask.Flask(__name__)
+
+@app.route("/")
+def serve_index():
+ return flask.render_template(
+ "index.html.j2",
+ title = "UK Gender Pay Gap"
+ )
+
+if __name__ == "__main__":
+ app.run("0.0.0.0", port = 5005, debug = True) \ No newline at end of file
diff --git a/database.py b/database.py
new file mode 100644
index 0000000..26d2683
--- /dev/null
+++ b/database.py
@@ -0,0 +1,206 @@
+from dataclasses import dataclass
+import datetime
+import pymysql
+import os
+
+@dataclass
+class PayGapDatabase:
+
+ host: str = "db"
+ user: str = "root"
+ passwd: str = None
+ db: str = "paygap"
+ port: int = 3306
+
+ def __enter__(self):
+ if self.passwd is None:
+ self.passwd = os.environ["MYSQL_ROOT_PASSWORD"]
+
+ try:
+ self.__connection = self.__get_connection()
+ except Exception as e:
+ print(e)
+ if e.args[0] == 1049:
+ self.__connection = self.__build_db()
+ return self
+
+ def __exit__(self, type, value, traceback):
+ self.__connection.close()
+
+ def __get_connection(self):
+ return pymysql.connect(
+ host = self.host,
+ port = self.port,
+ user = self.user,
+ passwd = self.passwd,
+ charset = "utf8mb4",
+ database = self.db
+ )
+
+ def __build_db(self):
+ print("Building database...")
+ self.__connection = pymysql.connect(
+ host = self.host,
+ port = self.port,
+ user = self.user,
+ passwd = self.passwd,
+ charset = "utf8mb4",
+ )
+ with self.__connection.cursor() as cursor:
+ # unsafe:
+ cursor.execute("CREATE DATABASE %s" % self.db)
+ cursor.execute("USE %s" % self.db)
+
+ cursor.execute("""
+ CREATE TABLE sic_sections(
+ sic_section_id CHAR(1) NOT NULL PRIMARY KEY,
+ sic_section_name VARCHAR(128) NOT NULL
+ );
+ """)
+
+ cursor.execute("""
+ CREATE TABLE sic(
+ sic_code INT UNSIGNED NOT NULL PRIMARY KEY,
+ sic_description VARCHAR(512) NOT NULL,
+ sic_section CHAR(1) NOT NULL,
+ FOREIGN KEY (sic_section) REFERENCES sic_sections(sic_section_id)
+ );
+ """)
+
+ cursor.execute("""
+ CREATE TABLE employer(
+ company_number CHAR(8) NOT NULL PRIMARY KEY,
+ name VARCHAR(512) NOT NULL,
+ address TEXT NOT NULL,
+ postcode VARCHAR(8) NOT NULL,
+ policy_link VARCHAR(256) NULL,
+ responsible_person VARCHAR(128) NOT NULL,
+ size VARCHAR(20) NOT NULL,
+ current_name VARCHAR(512) NULL,
+ status VARCHAR(32) NULL,
+ type_ VARCHAR(128) NULL,
+ incorporated DATETIME NULL
+ )
+ """)
+
+ cursor.execute("""
+ CREATE TABLE employer_sic(
+ company_number CHAR(8) NOT NULL,
+ sic_code INT UNSIGNED NOT NULL,
+ PRIMARY KEY (company_number, sic_code),
+ FOREIGN KEY (company_number) REFERENCES employer(company_number),
+ FOREIGN KEY (sic_code) REFERENCES sic(sic_code)
+ );
+ """)
+
+ cursor.execute("""
+ CREATE TABLE pay(
+ pay_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ company_number CHAR(8) NOT NULL,
+ source VARCHAR(64) NOT NULL,
+ date_submitted DATETIME NOT NULL,
+ DiffMeanHourlyPercent DECIMAL(8,3) NOT NULL,
+ DiffMedianHourlyPercent DECIMAL(8,3) NOT NULL,
+ DiffMeanBonusPercent DECIMAL(8,3) NOT NULL,
+ DiffMedianBonusPercent DECIMAL(8,3) NOT NULL,
+ MaleBonusPercent DECIMAL(8,3) NOT NULL,
+ FemaleBonusPercent DECIMAL(8,3) NOT NULL,
+ MaleLowerQuartile DECIMAL(8,3) NOT NULL,
+ FemaleLowerQuartile DECIMAL(8,3) NOT NULL,
+ MaleLowerMiddleQuartile DECIMAL(8,3) NOT NULL,
+ FemaleLowerMiddleQuartile DECIMAL(8,3) NOT NULL,
+ MaleUpperMiddleQuartile DECIMAL(8,3) NOT NULL,
+ FemaleUpperMiddleQuartile DECIMAL(8,3) NOT NULL,
+ MaleTopQuartile DECIMAL(8,3) NOT NULL,
+ FemaleTopQuartile DECIMAL(8,3) NOT NULL,
+ FOREIGN KEY (company_number) REFERENCES employer(company_number)
+ );
+ """)
+
+ self.__connection.commit()
+ return self.__connection
+
+ def append_sic_sections(self, section_id, description):
+ # print("Section ID: '%s', Description: '%s'" % (section_id, description))
+ with self.__connection.cursor() as cursor:
+ cursor.execute("""
+ INSERT INTO sic_sections VALUES (%s, %s) ON DUPLICATE KEY UPDATE sic_section_name = %s;
+ """, (section_id, description, description))
+ self.__connection.commit()
+
+ def append_sic(self, code, description, section_id):
+ print("Appended code %d (%s) under section %s" % (code, description, section_id))
+ with self.__connection.cursor() as cursor:
+ cursor.execute("""
+ INSERT INTO sic VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE sic_description = %s, sic_section = %s;
+ """, (code, description, section_id, description, section_id))
+ self.__connection.commit()
+
+ def append_employer(self, company_number, name, address, postcode, policy_link, responsible_person, size, current_name, \
+ status, type_, incorporated, sics):
+
+ # print("incorporated: %s" % str(incorporated))
+ # print("sics", sics)
+ # print("name: %s" % name)
+ with self.__connection.cursor() as cursor:
+ cursor.execute("""
+ INSERT INTO employer (company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated)
+ VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+ ON DUPLICATE KEY UPDATE
+ name = %s, address = %s, postcode = %s, policy_link = %s, responsible_person = %s, size = %s,
+ current_name = %s, status = %s, type_ = %s, incorporated = %s;
+ """, (
+ company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated,
+ name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated
+ ))
+ # sql = """INSERT INTO employer (company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated)
+ # VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s');""" % (
+ # company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated
+ # )
+ # print(sql)
+
+ self.append_employer_sics(company_number, sics)
+ self.__connection.commit()
+
+ def append_pay_info(self, company_number, source, date_submitted, diff_mean_hourly_percent, diff_median_hourly_percent, \
+ diff_mean_bonus_percent, diff_median_bonus_percent, male_bonus_percent, female_bonus_percent, male_lower_quartile, \
+ female_lower_quartile, male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, \
+ female_upper_middle_quartile, male_top_quartile, female_top_quartile):
+
+ try:
+ float(diff_mean_hourly_percent)
+ except ValueError:
+ diff_mean_hourly_percent = None
+
+
+ with self.__connection.cursor() as cursor:
+ cursor.execute("DELETE FROM pay WHERE company_number = %s AND source = %s;", (company_number, source))
+
+ try:
+ cursor.execute("""
+ INSERT INTO pay (company_number, source, date_submitted, DiffMeanHourlyPercent, DiffMedianHourlyPercent,
+ DiffMeanBonusPercent, DiffMedianBonusPercent, MaleBonusPercent, FemaleBonusPercent, MaleLowerQuartile,
+ FemaleLowerQuartile, MaleLowerMiddleQuartile, FemaleLowerMiddleQuartile, MaleUpperMiddleQuartile,
+ FemaleUpperMiddleQuartile, MaleTopQuartile, FemaleTopQuartile) VALUES (
+ %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
+ );
+ """, (
+ company_number, source, date_submitted, diff_mean_hourly_percent, diff_median_hourly_percent,
+ diff_mean_bonus_percent, diff_median_bonus_percent, male_bonus_percent, female_bonus_percent, male_lower_quartile,
+ female_lower_quartile, male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile,
+ female_upper_middle_quartile, male_top_quartile, female_top_quartile
+ ))
+ except pymysql.err.DataError:
+ return
+
+ self.__connection.commit()
+
+
+ def append_employer_sics(self, company_number, sics):
+ with self.__connection.cursor() as cursor:
+ cursor.execute("DELETE FROM employer_sic WHERE company_number = %s", (company_number, ))
+
+ for sic in sics:
+ cursor.execute("SELECT * FROM sic WHERE sic_code = %s", (sic, ))
+ if cursor.fetchone() != None:
+ cursor.execute("INSERT INTO employer_sic VALUES (%s, %s);", (company_number, sic))
diff --git a/db.env.example b/db.env.example
new file mode 100644
index 0000000..ea8bd2d
--- /dev/null
+++ b/db.env.example
@@ -0,0 +1 @@
+MYSQL_ROOT_PASSWORD=************
diff --git a/insinuations.py b/insinuations.py
new file mode 100644
index 0000000..a94607b
--- /dev/null
+++ b/insinuations.py
@@ -0,0 +1,82 @@
+from lxml import html
+import database
+import datetime
+import requests
+import os
+
+def get_sics(db: database.PayGapDatabase, url = "https://resources.companieshouse.gov.uk/sic/"):
+ req = requests.get(url)
+ tree = html.fromstring(req.content.decode())
+ bigtable = tree.xpath("/html/body/main/table/tbody")[0]
+ for tr_elem in bigtable.getchildren():
+ td_code, td_description = tr_elem
+
+ if td_code.getchildren() != []:
+ # if contains a <strong> element which indicates a section
+ current_section_code = td_code.getchildren()[0].text.replace("Section ", "").strip()
+ current_section_description = td_description.getchildren()[0].text.strip()
+
+ db.append_sic_sections(current_section_code, current_section_description)
+
+ else:
+ sic_code = int(td_code.text)
+ sic_desc = td_description.text.rstrip()
+ db.append_sic(sic_code, sic_desc, current_section_code)
+
+def get_companyinfo_url(company_number, url = "https://find-and-update.company-information.service.gov.uk/company/%s"):
+ if company_number.isdigit():
+ company_number = "%08d" % int(company_number)
+
+ return url % company_number
+
+def lookup_company(company_number):
+ company = {}
+ req = requests.get(
+ get_companyinfo_url(company_number),
+ headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_8) AppleWebKit/534.50.2 (KHTML, like Gecko) Version/5.0.6 Safari/533.22.3'}
+ )
+
+ if req.status_code not in [200, 404]:
+ raise ConnectionError("Couldn't connect- it %d'd. Was looking for company %s" % (req.status_code, company_number))
+
+ tree = html.fromstring(req.content.decode())
+
+ status_elem = tree.xpath('//*[@id="company-status"]')
+ if len(status_elem) == 1:
+ company["status"] = status_elem[0].text.strip()
+ else:
+ company["status"] = None
+
+ incorp_elem = tree.xpath('//*[@id="company-creation-date"]')
+ if len(incorp_elem) == 1:
+ company["incorporated"] = datetime.datetime.strptime(incorp_elem[0].text.strip(), "%d %B %Y")
+ else:
+ company["incorporated"] = None
+
+ type_elem = tree.xpath('//*[@id="company-type"]')
+ if len(type_elem) == 1:
+ company["type_"] = type_elem[0].text.strip()
+ else:
+ company["type_"] = None
+
+ company["sics"] = set()
+ for i in range(9):
+ sic_elem = tree.xpath('//*[@id="sic%d"]' % i)
+ if len(sic_elem) == 1:
+ company["sics"].add(int(sic_elem[0].text.strip().split(" - ")[0]))
+ else:
+ break
+
+ return company
+
+if __name__ == "__main__":
+ # if not os.path.exists(".docker"):
+ # import dotenv
+ # dotenv.load_dotenv(dotenv_path = "db.env")
+ # host = "srv.home"
+ # else:
+ # host = "db"
+
+ # with database.PayGapDatabase(host = host) as db:
+ # get_sics(db)
+ print(lookup_company("02838054")) \ No newline at end of file
diff --git a/parser.py b/parser.py
new file mode 100644
index 0000000..2918d73
--- /dev/null
+++ b/parser.py
@@ -0,0 +1,81 @@
+import insinuations
+import database
+import datetime
+import time
+import json
+import csv
+import sys
+import os
+
+def parse_csv(db, csv_path):
+ insinuations.get_sics(db)
+
+ with open(csv_path, "r") as f:
+ num_lines = len(f.readlines())
+ i = 0
+
+ with open(csv_path, "r") as f:
+ reader = csv.reader(f)
+ headers = next(reader)
+
+ for name, id_, address, postcode, company_id, sic_codes, diff_mean_hourly_percent, diff_median_hourly_percent, \
+ diff_mean_bonus_percent, diff_median_bonus_percent, male_bonus_percent, female_bonus_percent, male_lower_quartile, \
+ female_lower_quartile, male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, \
+ female_upper_middle_quartile, male_top_quartile, female_top_quartile, policy_link, responsible_person, size, \
+ current_name, submitted_after_deadline, duedate, submitted_date in reader:
+
+ if company_id.strip() != "":
+ try:
+ sic_codes = {int(i.strip()) for i in sic_codes.split(",")}
+ except ValueError:
+ sic_codes = set()
+
+ while True:
+ try:
+ company = insinuations.lookup_company(company_id)
+ except ConnectionError as e:
+ print("Couldn't connect... Error: '%s'... Waiting 20 seconds..." % str(e))
+ time.sleep(20)
+ else:
+ break
+
+ company["sics"] = company["sics"].union(sic_codes)
+ company["company_number"] = company_id
+ company["name"] = name
+ company["address"] = address
+ company["postcode"] = postcode
+ company["policy_link"] = policy_link
+ company["responsible_person"] = responsible_person
+ company["size"] = size
+ company["current_name"] = current_name
+
+ print("%.2f%%" % ((i / num_lines) * 100), company)
+ print(
+ company_id, os.path.basename(csv_path), datetime.datetime.strptime(submitted_date, "%Y/%m/%d %H:%M:%S"),
+ diff_mean_hourly_percent, diff_median_hourly_percent, diff_mean_bonus_percent, diff_median_bonus_percent,
+ male_bonus_percent, female_bonus_percent, male_lower_quartile, female_lower_quartile,
+ male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile,
+ female_upper_middle_quartile, male_top_quartile, female_top_quartile
+ )
+ db.append_employer(**company)
+ db.append_pay_info(
+ company_id, os.path.basename(csv_path), datetime.datetime.strptime(submitted_date, "%Y/%m/%d %H:%M:%S"),
+ diff_mean_hourly_percent, diff_median_hourly_percent, diff_mean_bonus_percent, diff_median_bonus_percent,
+ male_bonus_percent, female_bonus_percent, male_lower_quartile, female_lower_quartile,
+ male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile,
+ female_upper_middle_quartile, male_top_quartile, female_top_quartile
+ )
+ i += 1
+
+ # break
+
+if __name__ == "__main__":
+ if not os.path.exists(".docker"):
+ import dotenv
+ dotenv.load_dotenv(dotenv_path = "db.env")
+ host = "srv.home"
+ else:
+ host = "db"
+
+ with database.PayGapDatabase(host = host) as db:
+ parse_csv(db, sys.argv[1]) \ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..2eff776
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+pymysql
+lxml
+python-dotenv
+flask
diff --git a/static/scripts.js b/static/scripts.js
new file mode 100644
index 0000000..c6c3320
--- /dev/null
+++ b/static/scripts.js
@@ -0,0 +1 @@
+console.log("foo!") \ No newline at end of file
diff --git a/static/style.css b/static/style.css
new file mode 100644
index 0000000..be2f299
--- /dev/null
+++ b/static/style.css
@@ -0,0 +1,42 @@
+body {
+ font-family: Helvetica, sans-serif;
+}
+
+header {
+ font-size: large;
+ padding-left: 1%;
+ /* font-weight: bold; */
+}
+
+header p {
+ font-size: small;
+}
+
+a {
+ color: black;
+ font-weight: bold;
+ padding-top: 1px;
+}
+
+aside {
+ width: 15%;
+ padding-left: 15px;
+ margin-left: 15px;
+ float: right;
+ /* border-left-color: rgb(189, 189, 189);
+ border-left-width: 2px;
+ border-left-style: groove; */
+}
+
+#main_content {
+ padding-left: 2.5%;
+ padding-right: 2.5;
+}
+
+footer {
+ padding-left: 10%;
+ padding-right: 10%;
+ padding-top: 50px;
+ font-size: xx-small;
+ justify-content: center;
+} \ No newline at end of file
diff --git a/templates/index.html.j2 b/templates/index.html.j2
new file mode 100644
index 0000000..91f314e
--- /dev/null
+++ b/templates/index.html.j2
@@ -0,0 +1,4 @@
+{% extends "template.html.j2" %}
+{% block content %}
+ <p>foo</p>
+{% endblock %} \ No newline at end of file
diff --git a/templates/template.html.j2 b/templates/template.html.j2
new file mode 100644
index 0000000..1fb238d
--- /dev/null
+++ b/templates/template.html.j2
@@ -0,0 +1,44 @@
+<!DOCTYPE HTML>
+<html>
+<head>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+ <title>UK Gender Pay Gap :: {{ title }}</title>
+ <!-- JQuery and JQurty UI current version -->
+ <script src='https://code.jquery.com/jquery-3.6.0.js'></script>
+ <script src="https://code.jquery.com/ui/1.13.1/jquery-ui.js"></script>
+
+ <!-- Highcharts libraries -->
+ <script type="text/javascript" src="https://code.highcharts.com/highcharts.js"></script>
+ <script type="text/javascript" src="https://code.highcharts.com/highcharts-more.js"></script>
+ <script type="text/javascript" src="https://code.highcharts.com/modules/exporting.js"></script>
+ <script type="text/javascript" src="https://code.highcharts.com/modules/export-data.js"></script>
+ <script type="text/javascript" src="https://code.highcharts.com/modules/accessibility.js"></script>
+
+ <script type="text/javascript" src="https://code.highcharts.com/maps/modules/map.js"></script>
+ <script type="text/javascript" src="https://code.highcharts.com/mapdata/custom/world-robinson.js"></script>
+
+ <!-- local Javascript files -->
+ <script type="text/javascript" src="{{ url_for('static', filename='scripts.js') }}"></script>
+ <!-- remote and local CSS stylesheet -->
+ <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
+</head>
+
+<body>
+ <header>
+ <h1>{{ title }}</h1>
+ <p>Data provided by the <a href="https://gender-pay-gap.service.gov.uk/">UK Government</a></p>
+ </header>
+
+ <aside>
+ <h4>Filter...</h4>
+ </aside>
+
+ <div id="main_content">
+ {% block content %}
+ {% endblock %}
+ </div>
+
+ <footer>
+ <p>Source code released under GPLv3</p>
+ </footer>
+</body> \ No newline at end of file