Add postcode data ingestion job #5966

Draft: wants to merge 17 commits into main
9 changes: 9 additions & 0 deletions cron-scheduler.py
@@ -39,6 +39,7 @@
    EVERY_THREE_AM,
    EVERY_THREE_AM_ON_TWENTY_EIGHTH_EACH_MONTH,
    EVERY_TWO_AM,
    EVERY_WEEK,
    HALF_DAY_IN_SECONDS,
    ONE_HOUR_IN_SECONDS,
)
@@ -57,6 +58,9 @@
    schedule_refresh_gross_value_added_value_for_fdi_investment_projects,
)
from datahub.investment_lead.tasks.ingest_eyb_triage import eyb_triage_identification_task
from datahub.metadata.tasks import (
    postcode_data_identification_task,
)
from datahub.omis.payment.tasks import refresh_pending_payment_gateway_sessions
from datahub.reminder.migration_tasks import run_ita_users_migration, run_post_users_migration
from datahub.reminder.tasks import (
@@ -154,6 +158,11 @@ def schedule_jobs():
        cron=EVERY_HOUR,
        description='Identify new Stova attendee objects and schedule their ingestion',
    )
    job_scheduler(
        function=postcode_data_identification_task,
        cron=EVERY_WEEK,
        description='Identify new Postcode objects and schedule their ingestion',
    )

    if settings.ENABLE_ESTIMATED_LAND_DATE_REMINDERS:
        job_scheduler(
4 changes: 2 additions & 2 deletions datahub/company_activity/tasks/ingest_stova_attendees.py
@@ -21,7 +21,7 @@
def stova_attendee_identification_task() -> None:
    """Identifies the most recent file to be ingested and schedules a task to ingest it"""
    logger.info('Stova attendee identification task started.')
-    identification_task = StovaAttendeeIndentificationTask(prefix=STOVA_ATTENDEE_PREFIX)
+    identification_task = StovaAttendeeIdentificationTask(prefix=STOVA_ATTENDEE_PREFIX)
    identification_task.identify_new_objects(stova_attendee_ingestion_task)
    logger.info('Stova attendee identification task finished.')

@@ -37,7 +37,7 @@ def stova_attendee_ingestion_task(object_key: str) -> None:
    logger.info(f'Stova attendee ingestion task finished for file {object_key}.')


-class StovaAttendeeIndentificationTask(BaseObjectIdentificationTask):
+class StovaAttendeeIdentificationTask(BaseObjectIdentificationTask):
    pass


4 changes: 2 additions & 2 deletions datahub/company_activity/tasks/ingest_stova_events.py
@@ -15,7 +15,7 @@
def stova_event_identification_task() -> None:
    """Identifies the most recent file to be ingested and schedules a task to ingest it"""
    logger.info('Stova event identification task started.')
-    identification_task = StovaEventIndentificationTask(prefix=STOVA_EVENT_PREFIX)
+    identification_task = StovaEventIdentificationTask(prefix=STOVA_EVENT_PREFIX)
    identification_task.identify_new_objects(stova_event_ingestion_task)
    logger.info('Stova event identification task finished.')

@@ -31,7 +31,7 @@ def stova_event_ingestion_task(object_key: str) -> None:
    logger.info(f'Stova event ingestion task finished for file {object_key}.')


-class StovaEventIndentificationTask(BaseObjectIdentificationTask):
+class StovaEventIdentificationTask(BaseObjectIdentificationTask):
    pass


2 changes: 2 additions & 0 deletions datahub/core/queues/constants.py
@@ -26,3 +26,5 @@
THIRTY_MINUTES_IN_SECONDS = 60 * 30
ONE_HOUR_IN_SECONDS = 60 * 60
HALF_DAY_IN_SECONDS = 12 * ONE_HOUR_IN_SECONDS

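# Runs at 00:00 every Monday.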
EVERY_WEEK = '0 0 * * MON'
9 changes: 9 additions & 0 deletions datahub/metadata/constants.py
@@ -0,0 +1,9 @@
import environ

from datahub.ingest.constants import PREFIX


env = environ.Env()


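# S3 key prefix for postcode export files; the root path comes from
# datahub.ingest.constants.PREFIX.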
POSTCODE_DATA_PREFIX = f'{PREFIX}ExportPostcodeDirectory/'
6 changes: 6 additions & 0 deletions datahub/metadata/metadata.py
@@ -6,6 +6,7 @@
    CountrySerializer,
    ExchangeRateSerializer,
    InvestmentProjectStageSerializer,
    PostcodeDataSerializer,
    SectorSerializer,
    ServiceSerializer,
    TeamSerializer,
@@ -104,3 +105,8 @@
)
registry.register(metadata_id='fdi-value', model=models.FDIValue)
registry.register(metadata_id='export-barrier', model=models.ExportBarrierType)
registry.register(
Contributor: It might be worth adding a test to check the serializer returns a Postcode instance in the intended format? For example, a test that sends a GET request to the metadata endpoint and checks that it returns a list of postcode instances.

    metadata_id='postcode-data',
    model=models.PostcodeData,
    serializer=PostcodeDataSerializer,
)
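A minimal sketch of the test suggested above. The URL name, the api_client fixture, and the response shape are assumptions; adjust them to this project's metadata test conventions.

import pytest
from rest_framework import status
from rest_framework.reverse import reverse

from datahub.metadata.test.factories import PostcodeDataFactory


@pytest.mark.django_db
def test_postcode_data_metadata_endpoint_lists_postcodes(api_client):
    """GET on the postcode-data metadata endpoint returns serialized postcodes."""
    postcode_data = PostcodeDataFactory()
    url = reverse('api-v4:metadata:postcode-data')  # assumed route name
    response = api_client.get(url)  # api_client is an assumed test fixture
    assert response.status_code == status.HTTP_200_OK
    assert any(item['id'] == str(postcode_data.id) for item in response.json())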
31 changes: 31 additions & 0 deletions datahub/metadata/migrations/0090_postcodedata.py
@@ -0,0 +1,31 @@
# Generated by Django 4.2.17 on 2025-02-19 14:04

from django.db import migrations, models
import django.db.models.deletion
import uuid


class Migration(migrations.Migration):

    dependencies = [
        ('metadata', '0089_add_stova_serivce'),
    ]

    operations = [
        migrations.CreateModel(
            name='PostcodeData',
            fields=[
                ('disabled_on', models.DateTimeField(blank=True, null=True)),
                ('id', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)),
                ('name', models.TextField(blank=True)),
oliverjwroberts (Contributor), Feb 19, 2025: It feels like having both the name and postcode fields is somewhat unnecessary, and we should choose one or the other. However, I'd be keen to get others' opinions on this?

Contributor: What is a name in the context of a postcode? Do we have an example of what this field would contain?

                ('postcode', models.CharField(max_length=255)),
                ('modified_on', models.DateTimeField(auto_now=True, null=True)),
                ('publication_date', models.DateTimeField(blank=True, null=True)),
                ('postcode_region', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='+', to='metadata.ukregion')),
            ],
            options={
                'ordering': ('name',),
                'abstract': False,
            },
        ),
    ]
15 changes: 15 additions & 0 deletions datahub/metadata/models.py
@@ -405,3 +405,18 @@ class FDIValue(BaseOrderedConstantModel):

class ExportBarrierType(BaseOrderedConstantModel):
"""Export barrier type (used for company interactions)."""


class PostcodeData(BaseConstantModel):
    """Postcode data (for the manual addition of a company)."""

    postcode = models.CharField(max_length=MAX_LENGTH)
    modified_on = models.DateTimeField(auto_now=True, null=True)
    postcode_region = models.ForeignKey(
        UKRegion,
        blank=True,
        null=True,
        on_delete=models.SET_NULL,
        related_name='+',
    )
    publication_date = models.DateTimeField(null=True, blank=True)
16 changes: 15 additions & 1 deletion datahub/metadata/serializers.py
@@ -7,7 +7,7 @@
from datahub.export_win.models import TeamType
from datahub.interaction.models import ServiceAnswerOption, ServiceQuestion
from datahub.metadata.models import (
-    Country, ExchangeRate, OverseasRegion, Service, TeamRole, UKRegion,
+    Country, ExchangeRate, OverseasRegion, PostcodeData, Service, TeamRole, UKRegion,
)


@@ -127,3 +127,17 @@ class HVCSerializer(ConstantModelSerializer):

    campaign_id = serializers.ReadOnlyField()
    financial_year = serializers.ReadOnlyField()


class PostcodeDataSerializer(ConstantModelSerializer):
    """Postcode data serializer"""

    id = serializers.UUIDField()
    postcode = serializers.CharField()
    modified_on = serializers.DateTimeField()
    postcode_region = NestedRelatedField(UKRegion, read_only=True)
    publication_date = serializers.DateTimeField()

    class Meta:
        model = PostcodeData
        fields = '__all__'
86 changes: 86 additions & 0 deletions datahub/metadata/tasks.py
@@ -0,0 +1,86 @@
import logging

from rest_framework import serializers

from datahub.ingest.boto3 import S3ObjectProcessor
from datahub.ingest.tasks import BaseObjectIdentificationTask, BaseObjectIngestionTask
from datahub.metadata.constants import POSTCODE_DATA_PREFIX
from datahub.metadata.models import PostcodeData
from datahub.metadata.serializers import PostcodeDataSerializer


logger = logging.getLogger(__name__)


def postcode_data_identification_task() -> None:
    logger.info('Postcode data identification task started...')
    identification_task = PostcodeDataIdentificationTask(prefix=POSTCODE_DATA_PREFIX)
    identification_task.identify_new_objects(postcode_data_ingestion_task)
    logger.info('Postcode data identification task finished.')


class PostcodeDataIdentificationTask(BaseObjectIdentificationTask):
"""Class to identify new postcode data objects and determine if they should be ingested."""


def postcode_data_ingestion_task(object_key: str) -> None:
    logger.info('Postcode data ingestion task started...')
    ingestion_task = PostcodeDataIngestionTask(
        object_key=object_key,
        s3_processor=S3ObjectProcessor(prefix=POSTCODE_DATA_PREFIX),
        serializer_class=PostcodeDataSerializer,
    )
    ingestion_task.ingest_object()
    logger.info('Postcode data ingestion task finished.')


class PostcodeDataIngestionTask(BaseObjectIngestionTask):
"""Class to ingest a postcode object from S3."""

def __init__(
self,
object_key: str,
s3_processor: S3ObjectProcessor,
serializer_class: serializers.Serializer,
) -> None:
self.serializer_class = serializer_class
super().__init__(object_key, s3_processor)

existing_ids = []
Contributor: Just for consistency, you can probably set this in the __init__ function, the same way self.serializer_class is defined, i.e. self.existing_ids = []. But nice approach loading them into memory first, instead of making multiple DB requests!


    def _should_process_record(self, record: dict) -> bool:
        """Checks whether the record has already been ingested or not."""
        if not self.existing_ids:
            self.existing_ids = set(PostcodeData.objects.values_list('id', flat=True))

        postcode_data_id = record.get('id')
Contributor: Will the incoming records have an ID? Maybe we want to ignore these if we are setting our own UUID?

        if postcode_data_id in self.existing_ids:
            logger.info(f'Record already exists for postcode_data_id: {postcode_data_id}')
            return False

        return True

    def _process_record(self, record: dict) -> None:
        """Processes a single record.

        This method should take a single record, update an existing instance,
        or create a new one, and return None.
        """
        serializer = self.serializer_class(data=record)
        if serializer.is_valid():
            primary_key = serializer.validated_data.pop('id')
            queryset = PostcodeData.objects.filter(pk=primary_key)
            instance, created = queryset.update_or_create(
Contributor: Because we aren't updating postcode records in the first instance, this update_or_create call is probably redundant. Instead you could do something like:

def _process_record(self, record: dict) -> None:
    serializer = self.serializer_class(data=record)
    if serializer.is_valid():
        # Setting an id from the incoming data may raise an error when we've
        # told Django to auto-generate a UUID.
        serializer.validated_data.pop('id')
        instance = PostcodeData.objects.create(**serializer.validated_data)
        self.created_ids.append(str(instance.id))
    else:
        self.errors.append({
            'record': record,
            'errors': serializer.errors,
        })

This may also fix some of the test coverage.

Contributor: Why aren't we updating postcode records? Given the point of this exercise is to improve accuracy, it seems like we probably should, unless there's a reason not to? I don't think the data size is prohibitive?

                pk=primary_key,
                defaults=serializer.validated_data,
            )
            if created:
                self.created_ids.append(str(instance.id))
            else:
                self.updated_ids.append(str(instance.id))
        else:
            self.errors.append({
                'record': record,
                'errors': serializer.errors,
            })
35 changes: 35 additions & 0 deletions datahub/metadata/test/factories.py
@@ -1,10 +1,16 @@
import uuid
from datetime import timezone

from random import randrange, sample

import factory
from faker import Faker

from datahub.core import constants
from datahub.metadata.models import Service

fake = Faker(locale='en_GB')


class ServiceFactory(factory.django.DjangoModelFactory):
"""Service factory."""
@@ -119,3 +125,32 @@ class AdministrativeAreasFactory(factory.django.DjangoModelFactory):

    class Meta:
        model = 'metadata.AdministrativeArea'


class PostcodeDataFactory(factory.django.DjangoModelFactory):
"""Postcode data factory"""

postcode = factory.Faker('postcode')
modified_on = '2025-10-08T08:06:53+00:00'
postcode_region = factory.Faker('postcode_region')
Contributor: The value passed into the factory.Faker() call is the name of the Faker provider. I don't think there is one called postcode_region. It might be more appropriate to select a random UK region here instead?

    publication_date = '2025-02-02T08:08:52+00:00'

    class Meta:
        model = 'metadata.PostcodeData'
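A minimal sketch of one way to act on that comment, assuming UKRegion rows are seeded in the test database (factory.Iterator evaluates the queryset lazily, so declaring it at import time is safe):

from datahub.metadata.models import UKRegion

class PostcodeDataFactory(factory.django.DjangoModelFactory):
    """Postcode data factory."""

    postcode = factory.Faker('postcode')
    # Cycle through real UKRegion instances instead of the non-existent
    # 'postcode_region' Faker provider.
    postcode_region = factory.Iterator(UKRegion.objects.all())

    class Meta:
        model = 'metadata.PostcodeData'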


def postcode_data_record_faker(overrides: dict | None = None) -> dict:
    data = {
        'id': str(uuid.uuid4()),
        'postcode': fake.postcode(),
        'modified_on': fake.date_time_between(
            start_date='-1y', tzinfo=timezone.utc,
        ),
        'publication_date': fake.date_time_between(
            start_date='-1y', tzinfo=timezone.utc,
        ),
        'postcode_region': constants.UKRegion.london.name,
    }
    if overrides:
        data.update(overrides)
    return data
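Usage is straightforward: pass an overrides dict to pin any field for a test (the postcode value below is just an example).

record = postcode_data_record_faker({'postcode': 'SW1A 1AA'})
assert record['postcode'] == 'SW1A 1AA'  # other fields stay randomised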