Model/data_collection/adzuna.py
Khalim Conn-Kowlessar abed2ce2de adzuna wip
2023-06-29 21:08:21 +01:00

86 lines
2.8 KiB
Python

import requests
import json
from data_collection.config import ADZUNA_API_KEY, ADZUNA_APP_ID
import pandas as pd
import os
import time
from tqdm import tqdm
"""
Table of constituencies and their codes can be downloaded from the Office of National Statistics, found here:
https://geoportal.statistics.gov.uk/datasets/ons::westminster-parliamentary-constituencies-december-2022-names-and
-codes-in-the-united-kingdom/explore
"""
# Absolute directory of this module; the ONS lookup table is located relative to it.
_MODULE_DIR = os.path.abspath(os.path.dirname(__file__))

# NOTE(review): the relative part starts with "/data_collection/" although this
# module itself appears to live in a data_collection/ directory, so the resolved
# path may be one level deeper than intended. Confirm against the repo layout.
_CONSTITUENCY_CSV = (
    "/data_collection/data/Westminster_Parliamentary_Constituencies_("
    "December_2022)_Names_and_Codes_in_the_United_Kingdom.csv"
)

# Load the constituency names/codes table and tag every row with its location type.
constituencies = pd.read_csv(_MODULE_DIR + _CONSTITUENCY_CSV).assign(
    location_type="constituency"
)
def retry_api_call(job_title, location, max_retries=10, delay=2):
    """Call ``get_adzuna_jobs`` and retry when the HTTP request fails.

    Args:
        job_title: Search phrase forwarded unchanged to ``get_adzuna_jobs``.
        location: Location string forwarded unchanged to ``get_adzuna_jobs``.
        max_retries: Maximum number of attempts made before giving up.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        Whatever ``get_adzuna_jobs`` returns on the first successful attempt,
        or ``None`` when every attempt failed. Callers must handle ``None``.
    """
    for attempt in range(1, max_retries + 1):
        try:
            return get_adzuna_jobs(job_title, location)
        except (requests.HTTPError, requests.ConnectionError, requests.Timeout):
            # Timeouts are transient in the same way connection errors are, so
            # they are retried as well instead of aborting the caller.
            if attempt == max_retries:
                # Nothing left to retry: skip the pointless wait and the
                # misleading "Retrying" message.
                break
            print(f"Attempt {attempt} failed. Retrying in {delay} seconds...")
            time.sleep(delay)
    print(f"Failed after {max_retries} attempts.")
    return None
def get_adzuna_jobs(
    job_title,
    location,
    *,
    country_code="gb",
    page=1,
    results_per_page=25,
    distance=10,
    timeout=30,
):
    """Run one Adzuna job search and return the decoded JSON response.

    Args:
        job_title: Value sent as the ``what`` query parameter.
        location: Value sent as the ``where`` query parameter.
        country_code: Country segment of the endpoint URL.
        page: Page number segment of the endpoint URL.
        results_per_page: Value sent as the ``results_per_page`` parameter.
        distance: Value sent as the ``distance`` parameter (search radius;
            units are defined by the Adzuna API, not by this module).
        timeout: Seconds ``requests`` waits for the server before raising
            ``requests.Timeout``. Without it a stalled connection would block
            the whole collection run indefinitely.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.ConnectionError: If the connection cannot be established.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    base_url = "https://api.adzuna.com/v1/api/jobs"
    url = f"{base_url}/{country_code}/search/{page}"
    params = {
        "app_id": ADZUNA_APP_ID,
        "app_key": ADZUNA_API_KEY,
        "results_per_page": results_per_page,
        "what": job_title,
        "where": location,
        # Sent as a query parameter (not an HTTP header), as in the original call.
        "content-type": "application/json",
        "distance": distance,
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Turn 4xx/5xx answers into exceptions so callers can decide to retry.
    response.raise_for_status()
    return json.loads(response.text)
# Search phrases submitted to the job search, one request per phrase and location.
JOB_TITLES = [
    # Generic and measure-specific installer titles.
    "insulation installer",
    "internal wall insulation installer",
    "external wall insulation installer",
    "cavity wall insulation installer",
    "loft insulation installer",
    "roof insulation installer",
    "spray foam insulation installer",
    # Related role names.
    "insulation technician",
    "insulation engineer",
    # Abbreviated variants (iwi / ewi / cwi presumably mirror the internal /
    # external / cavity wall titles above).
    "iwi insulation installer",
    "iwi installer",
    "ewi insulation installer",
    "ewi installer",
    "cwi insulation installer",
    "cwi installer",
]
# Query every (job title, constituency) pair and collect one flat record per
# advert returned, annotated with the search that produced it.
results = []
for title_number, job_title in enumerate(JOB_TITLES, start=1):
    print(f"Pulling job title {title_number} of {len(JOB_TITLES)}")
    for _, location_config in tqdm(constituencies.iterrows(), total=constituencies.shape[0]):
        location = location_config["PCON22NM"]
        jobs = retry_api_call(job_title, location)
        # Short pause between searches (presumably to stay within the API's
        # rate limit -- confirm against the account's quota).
        time.sleep(0.5)
        if not jobs:
            # retry_api_call returns None once all attempts have failed; skip
            # this search instead of crashing and losing everything collected.
            continue
        results.extend(
            {
                "job_title": job_title,
                "search_location": location,
                "search_location_code": location_config["PCON22CD"],
                # Advert fields come last, exactly as before, so they still
                # take precedence if a key name ever collides.
                **job,
            }
            # A response without a "results" key is treated as "no adverts".
            for job in jobs.get("results") or []
        )
results_df = pd.DataFrame(results)