-
Notifications
You must be signed in to change notification settings - Fork 10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add postcode data ingestion job #5966
base: main
Are you sure you want to change the base?
Changes from all commits
e2b0053
fa338dd
e427286
19ad4a8
ea24185
bb6be51
bbc3f8a
33bfecc
341a596
96619a4
3801798
a93145a
2e2f933
3b95edf
7511281
4f4438e
5f4b00c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
import environ | ||
|
||
from datahub.ingest.constants import PREFIX | ||
|
||
|
||
env = environ.Env() | ||
|
||
|
||
POSTCODE_DATA_PREFIX = f'{PREFIX}ExportPostcodeDirectory/' |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Generated by Django 4.2.17 on 2025-02-19 14:04 | ||
|
||
from django.db import migrations, models | ||
import django.db.models.deletion | ||
import uuid | ||
|
||
|
||
class Migration(migrations.Migration): | ||
|
||
dependencies = [ | ||
('metadata', '0089_add_stova_serivce'), | ||
] | ||
|
||
operations = [ | ||
migrations.CreateModel( | ||
name='PostcodeData', | ||
fields=[ | ||
('disabled_on', models.DateTimeField(blank=True, null=True)), | ||
('id', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)), | ||
('name', models.TextField(blank=True)), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It feels like having both the However, I'd be keen to get others' opinions on this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is a name in the context of a postcode? Do we have an example of what this field would contain? |
||
('postcode', models.CharField(max_length=255)), | ||
('modified_on', models.DateTimeField(auto_now=True, null=True)), | ||
('publication_date', models.DateTimeField(blank=True, null=True)), | ||
('region', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='+', to='metadata.ukregion')), | ||
], | ||
options={ | ||
'ordering': ('name',), | ||
'abstract': False, | ||
}, | ||
), | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
import logging | ||
|
||
from rest_framework import serializers | ||
|
||
from datahub.ingest.boto3 import S3ObjectProcessor | ||
from datahub.ingest.tasks import BaseObjectIdentificationTask, BaseObjectIngestionTask | ||
from datahub.metadata.constants import POSTCODE_DATA_PREFIX | ||
from datahub.metadata.models import PostcodeData | ||
from datahub.metadata.serializers import PostcodeDataSerializer | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def postcode_data_identification_task() -> None:
    """Run the identification step for postcode data objects.

    Creates a ``PostcodeDataIdentificationTask`` scoped to ``POSTCODE_DATA_PREFIX``
    and passes ``postcode_data_ingestion_task`` to its ``identify_new_objects``
    method. Start and finish are logged at INFO level.
    """
    logger.info('Postcode data identification task started...')
    PostcodeDataIdentificationTask(
        prefix=POSTCODE_DATA_PREFIX,
    ).identify_new_objects(postcode_data_ingestion_task)
    logger.info('Postcode data identification task finished.')
|
||
|
||
class PostcodeDataIdentificationTask(BaseObjectIdentificationTask):
    """Identification task for postcode data objects.

    Adds no overrides of its own; all behaviour comes from
    ``BaseObjectIdentificationTask``.
    """
|
||
|
||
def postcode_data_ingestion_task(object_key: str) -> None:
    """Ingest the postcode data object identified by ``object_key``.

    Wires a ``PostcodeDataIngestionTask`` to an ``S3ObjectProcessor`` scoped to
    ``POSTCODE_DATA_PREFIX`` and to ``PostcodeDataSerializer``, then calls its
    ``ingest_object`` method. Start and finish are logged at INFO level.
    """
    logger.info('Postcode data ingestion task started...')
    processor = S3ObjectProcessor(prefix=POSTCODE_DATA_PREFIX)
    task = PostcodeDataIngestionTask(
        object_key=object_key,
        s3_processor=processor,
        serializer_class=PostcodeDataSerializer,
    )
    task.ingest_object()
    logger.info('Postcode data ingestion task finished.')
|
||
|
||
class PostcodeDataIngestionTask(BaseObjectIngestionTask): | ||
"""Class to ingest a postcode object from S3.""" | ||
|
||
def __init__(
    self,
    object_key: str,
    s3_processor: S3ObjectProcessor,
    serializer_class: type[serializers.Serializer],
) -> None:
    """Remember the serializer class, then initialise the base ingestion task.

    Args:
        object_key: Key of the object to ingest; forwarded to the base class.
        s3_processor: Processor forwarded to the base class.
        serializer_class: The serializer *class* (not an instance). It is
            instantiated with ``data=record`` for each record processed.
    """
    # Assigned before the base initialiser runs, matching the original ordering.
    self.serializer_class = serializer_class
    super().__init__(object_key, s3_processor)
|
||
existing_ids = [] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just for consistency, you can probably set this in the `__init__` method. But nice approach loading them into memory first, instead of multiple DB requests! |
||
|
||
def _should_process_record(self, record: dict) -> bool:
    """Return ``True`` when ``record`` has not been ingested yet.

    The ids of all existing ``PostcodeData`` rows are loaded once and cached on
    the instance as a set of lower-cased strings, so each record costs a set
    lookup rather than a database query.

    Args:
        record: The incoming record; its ``'id'`` key is compared against the
            cached ids.

    Returns:
        ``False`` if a row with the record's id already exists, else ``True``.
    """
    # A set (even an empty one) marks the cache as loaded. Testing truthiness
    # instead would re-run the query for every record while the table is empty.
    if not isinstance(self.existing_ids, set):
        # ``id`` is a UUIDField, so the ORM yields ``uuid.UUID`` objects; store them
        # as strings so they compare equal to ids that arrive as text.
        self.existing_ids = {
            str(existing_id).lower()
            for existing_id in PostcodeData.objects.values_list('id', flat=True)
        }

    postcode_data_id = record.get('id')
    # NOTE(review): Will the incoming records have an ID? Maybe we want to ignore
    # these if we are setting our own UUID?
    if postcode_data_id is None:
        return True

    if str(postcode_data_id).lower() in self.existing_ids:
        logger.info(f'Record already exists for postcode_data_id: {postcode_data_id}')
        return False

    return True
|
||
def _process_record(self, record: dict) -> None: | ||
"""Processes a single record. | ||
|
||
This method should take a single record, update an existing instance, | ||
or create a new one, and return None. | ||
""" | ||
serializer = self.serializer_class(data=record) | ||
if serializer.is_valid(): | ||
primary_key = serializer.validated_data.pop('id') | ||
queryset = PostcodeData.objects.filter(pk=primary_key) | ||
instance, created = queryset.update_or_create( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Because we aren't updating postcode records in the first instance, this could be simplified to:
def _process_record(self, record: dict) -> None:
    serializer = self.serializer_class(data=record)
    if serializer.is_valid():
        serializer.validated_data.pop('id')  # because setting an id from the incoming data may raise an error when we've told Django to auto generate a UUID
        instance = PostcodeData.objects.create(**serializer.validated_data)
        self.created_ids.append(str(instance.id))
    else:
        self.errors.append({
            'record': record,
            'errors': serializer.errors,
        })
This may also fix some of the test coverage.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why aren't we updating postcode records? Given the point of this exercise is to improve accuracy it seems like we probably should unless there's a reason not? I don't think the data size is prohibitive? |
||
pk=primary_key, | ||
defaults=serializer.validated_data, | ||
) | ||
if created: | ||
self.created_ids.append(str(instance.id)) | ||
else: | ||
self.updated_ids.append(str(instance.id)) | ||
else: | ||
self.errors.append({ | ||
'record': record, | ||
'errors': serializer.errors, | ||
}) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,16 @@ | ||
import uuid | ||
from datetime import timezone | ||
|
||
from random import randrange, sample | ||
|
||
import factory | ||
from faker import Faker | ||
|
||
from datahub.core import constants | ||
from datahub.metadata.models import Service | ||
|
||
fake = Faker(locale='en_GB') | ||
|
||
|
||
class ServiceFactory(factory.django.DjangoModelFactory): | ||
"""Service factory.""" | ||
|
@@ -119,3 +125,32 @@ class AdministrativeAreasFactory(factory.django.DjangoModelFactory): | |
|
||
class Meta: | ||
model = 'metadata.AdministrativeArea' | ||
|
||
|
||
class PostcodeDataFactory(factory.django.DjangoModelFactory): | ||
"""Postcode data factory""" | ||
|
||
postcode = factory.Faker('postcode') | ||
modified_on = '2025-10-08T08:06:53+00:00' | ||
postcode_region = factory.Faker('postcode_region') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The value passed into the |
||
publication_date = '2025-02-02T08:08:52+00:00' | ||
|
||
class Meta: | ||
model = 'metadata.PostcodeData' | ||
|
||
|
||
def postcode_data_record_faker(overrides: dict | None = None) -> dict:
    """Build a fake postcode data record as a plain dict.

    The record gets a fresh UUID string as its ``id``, a fake postcode, two
    UTC-aware datetimes drawn from the past year, and the London region name.

    Args:
        overrides: Optional mapping merged over the generated values.

    Returns:
        The generated record with any overrides applied.
    """
    past_year_utc = {'start_date': '-1y', 'tzinfo': timezone.utc}
    record = {
        'id': str(uuid.uuid4()),
        'postcode': fake.postcode(),
        'modified_on': fake.date_time_between(**past_year_utc),
        'publication_date': fake.date_time_between(**past_year_utc),
        'postcode_region': constants.UKRegion.london.name,
    }
    if not overrides:
        return record
    record.update(overrides)
    return record
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It might be worth adding a test to check the serializer returns a Postcode instance in the intended format? For example, if you were to send a GET request to the metadata endpoint, that it returns a list of postcode instances.