From f21d2778af88e851b2a143ea37ee47886c6eafde Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Thu, 29 Jun 2023 16:54:27 +0100
Subject: [PATCH] Added retry methodology

---
Review notes (text between "---" and the diffstat is ignored by `git am`):
- The main loop read `jobs["results"]` without ever assigning `jobs` and never
  called retry_api_call(), so the script failed with NameError on the first
  iteration. The call is now made once per (job title, location) pair.
- retry_api_call() returns None once its retries are exhausted, so the loop
  now skips a None response instead of subscripting it.
- retry_api_call() gained a short docstring.
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch (4-space indents assumed; a blank line is assumed after
  the imports in the first hunk). Run `git apply --check` before applying.
- The post-image blob id on the `index` line predates these review changes.

 data_collection/adzuna.py        | 61 +++++++++++++++++++++++---------
 data_collection/requirements.txt |  3 +-
 2 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/data_collection/adzuna.py b/data_collection/adzuna.py
index 0633024d..53dc9233 100644
--- a/data_collection/adzuna.py
+++ b/data_collection/adzuna.py
@@ -4,6 +4,8 @@
 from data_collection.config import ADZUNA_API_KEY, ADZUNA_APP_ID
 import pandas as pd
 import os
+import time
+from tqdm import tqdm
 
 """
 Table of constituencies and their codes can be downloaded from the Office of National Statistics, found here:
@@ -18,6 +20,24 @@ constituencies = pd.read_csv(
     "December_2022)_Names_and_Codes_in_the_United_Kingdom.csv"
 )
 
+constituencies["location_type"] = "constituency"
+
+
+def retry_api_call(job_title, location, max_retries=10):
+    """Call get_adzuna_jobs, retrying on HTTP or connection errors.
+
+    Returns the parsed response, or None once max_retries attempts fail.
+    """
+    for i in range(max_retries):
+        try:
+            response = get_adzuna_jobs(job_title, location)
+            return response
+        except (requests.HTTPError, requests.ConnectionError):
+            print(f"Attempt {i + 1} failed. Retrying in 2 seconds...")
+            time.sleep(2)
+    print(f"Failed after {max_retries} attempts.")
+    return None
+
 
 def get_adzuna_jobs(job_title, location):
     base_url = "https://api.adzuna.com/v1/api/jobs"
@@ -28,20 +48,18 @@ def get_adzuna_jobs(job_title, location):
     params = {
         "app_id": ADZUNA_APP_ID,
         "app_key": ADZUNA_API_KEY,
-        "results_per_page": 10,  # change as needed
+        "results_per_page": 25,
         "what": job_title,
         "where": location,
-        "content-type": "application/json"
+        "content-type": "application/json",
+        "distance": 10
     }
 
     response = requests.get(url, params=params)
+    response.raise_for_status()
 
-    if response.status_code == 200:
-        jobs = json.loads(response.text)
-        return jobs
-    else:
-        print(f"Error: {response.status_code}")
-        return None
+    jobs = json.loads(response.text)
+    return jobs
 
 
 JOB_TITLES = [
@@ -51,12 +69,23 @@ JOB_TITLES = [
     "iwi installer", "ewi insulation installer", "ewi installer", "cwi insulation installer", "cwi installer",
 ]
 
-for job_title in JOB_TITLES:
-    for _, location in constituencies.iterrows():
-        jobs = get_adzuna_jobs(job_title, location)
-        if jobs is not None:
+results = []
+for i, job_title in enumerate(JOB_TITLES):
+    print("Pulling job title %s of %s" % (str(i + 1), str(len(JOB_TITLES))))
+    for _, location_config in tqdm(constituencies.iterrows(), total=constituencies.shape[0]):
+
+        location = location_config["PCON22NM"]
+
+        # Fetch with retries; the result is None once every attempt has failed.
+        jobs = retry_api_call(job_title, location)
+        time.sleep(0.5)
+        # Skip locations whose request failed or that returned no results.
+        if jobs is not None and jobs.get("results"):
             for job in jobs['results']:
-                print(job['title'])
-                print(job['salary_min'])
-                print(job['salary_max'])
-                print()
+                to_append = {
+                    "job_title": job_title,
+                    "location": location,
+                    "location_code": location_config["PCON22CD"],
+                    **job
+                }
+                results.append(to_append)
diff --git a/data_collection/requirements.txt b/data_collection/requirements.txt
index c03fb512..12973ef7 100644
--- a/data_collection/requirements.txt
+++ b/data_collection/requirements.txt
@@ -1,3 +1,4 @@
 requests
 python-dotenv
-pandas
\ No newline at end of file
+pandas
+tqdm
\ No newline at end of file