From 5def5335b0fc0fc7ab8166cd8e79ab011c956ff1 Mon Sep 17 00:00:00 2001 From: Murilo Dal Ri Date: Wed, 5 Jun 2024 15:57:37 +0100 Subject: [PATCH] Lower batch size on ETL tasks When running the ETL rake tasks we sometimes might run into errors because the pods run out of memory. Lowering the batch size should hopefully help avoid those errors. --- app/domain/etl/feedex/processor.rb | 2 +- app/domain/etl/ga/internal_search_processor.rb | 2 +- app/domain/etl/ga/user_feedback_processor.rb | 2 +- app/domain/etl/ga/views_and_navigation_processor.rb | 2 +- app/domain/etl/main/metrics_processor.rb | 2 +- spec/integration/master/daily_metrics_spec.rb | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/app/domain/etl/feedex/processor.rb b/app/domain/etl/feedex/processor.rb index b28c8f713..caefcc615 100644 --- a/app/domain/etl/feedex/processor.rb +++ b/app/domain/etl/feedex/processor.rb @@ -18,7 +18,7 @@ def process private - BATCH_SIZE = 10_000 + BATCH_SIZE = 5_000 def extract_events batch = 1 diff --git a/app/domain/etl/ga/internal_search_processor.rb b/app/domain/etl/ga/internal_search_processor.rb index 8ae9cafb1..b8378dc28 100644 --- a/app/domain/etl/ga/internal_search_processor.rb +++ b/app/domain/etl/ga/internal_search_processor.rb @@ -25,7 +25,7 @@ def extract_events batch = 1 Etl::GA::InternalSearchService.find_in_batches(date:) do |events| log process: :ga, message: "Processing #{events.length} events in batch #{batch}" - Events::GA.import(events, batch_size: 10_000) + Events::GA.import(events, batch_size: 5_000) batch += 1 end end diff --git a/app/domain/etl/ga/user_feedback_processor.rb b/app/domain/etl/ga/user_feedback_processor.rb index deca31979..ebdf993e9 100644 --- a/app/domain/etl/ga/user_feedback_processor.rb +++ b/app/domain/etl/ga/user_feedback_processor.rb @@ -25,7 +25,7 @@ def extract_events batch = 1 Etl::GA::UserFeedbackService.find_in_batches(date:) do |events| log process: :ga, message: "Processing #{events.length} events in batch #{batch}" - Events::GA.import(events, batch_size: 10_000) + Events::GA.import(events, batch_size: 5_000) batch += 1 end end diff --git a/app/domain/etl/ga/views_and_navigation_processor.rb b/app/domain/etl/ga/views_and_navigation_processor.rb index b11cb3346..21408278f 100644 --- a/app/domain/etl/ga/views_and_navigation_processor.rb +++ b/app/domain/etl/ga/views_and_navigation_processor.rb @@ -25,7 +25,7 @@ def extract_events batch = 1 Etl::GA::ViewsAndNavigationService.find_in_batches(date:) do |events| log process: :ga, message: "Processing #{events.length} events in batch #{batch}" - Events::GA.import(events, batch_size: 10_000) + Events::GA.import(events, batch_size: 5_000) batch += 1 end end diff --git a/app/domain/etl/main/metrics_processor.rb b/app/domain/etl/main/metrics_processor.rb index 89703e30f..193585888 100644 --- a/app/domain/etl/main/metrics_processor.rb +++ b/app/domain/etl/main/metrics_processor.rb @@ -27,7 +27,7 @@ def create_metrics log process: :metrics, message: "about to get the Dimensions::Date" dimensions_date = Dimensions::Date.find_existing_or_create(date) log process: :metrics, message: "got the Dimensions::Date" - Dimensions::Edition.live.find_in_batches(batch_size: 10_000) + Dimensions::Edition.live.find_in_batches(batch_size: 5_000) .with_index do |batch, index| log process: :metrics, message: "processing #{batch.length} items in batch #{index}" values = batch.pluck(:id).map do |value| diff --git a/spec/integration/master/daily_metrics_spec.rb b/spec/integration/master/daily_metrics_spec.rb index 7fe1aea3c..914092ce8 100644 --- a/spec/integration/master/daily_metrics_spec.rb +++ b/spec/integration/master/daily_metrics_spec.rb @@ -170,7 +170,7 @@ def stub_feedex_response 'page_size': 3, }.to_json - stub_request(:get, "http://support-api.dev.gov.uk/feedback-by-day/#{yesterday}?page=1&per_page=10000") + stub_request(:get, "http://support-api.dev.gov.uk/feedback-by-day/#{yesterday}?page=1&per_page=5000") .to_return(status: 200, body: response, headers: {}) end end