Skip to content

Commit

Permalink
Add comments. Get export base directory from configuration.
Browse files Browse the repository at this point in the history
  • Loading branch information
sarkikos committed Aug 9, 2024
1 parent 39c5eed commit c0661b7
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 10 deletions.
40 changes: 34 additions & 6 deletions aspnetcore/src/Exporter/Exporter.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using AutoMapper;
using Nest;
using CSC.PublicApi.ElasticService;
using Microsoft.Extensions.Configuration;
using CSC.PublicApi.Service.Models.FundingCall;
using CSC.PublicApi.Service.Models.FundingDecision;
using CSC.PublicApi.Service.Models.ResearchDataset;
Expand All @@ -19,19 +20,32 @@ public class Exporter
private IElasticClient _elasticClient;
private readonly IMapper _mapper;
private readonly IndexNameSettings _indexNameSettings;
private readonly IConfiguration _configuration;
private const int SingleQueryResultLimit = 1000;
private const string ExportBaseDirectory = "/tmp";
private string? ExportBaseDirectory = "";

public Exporter(IElasticClient elasticClient, IMapper mapper, IndexNameSettings indexNameSettings)
public Exporter(IElasticClient elasticClient, IConfiguration configuration, IMapper mapper, IndexNameSettings indexNameSettings)
{
_elasticClient = elasticClient;
_configuration = configuration;
_mapper = mapper;
_indexNameSettings = indexNameSettings;

}

// Get export base directory from configuration
ExportBaseDirectory = _configuration["EXPORTER:BASEDIRECTORY"];
if (ExportBaseDirectory == null)
{
string errorMessage = $"Export: Failed: could not set export target directory from configuration (EXPORTER:BASEDIRECTORY)";
Console.WriteLine(errorMessage);
throw new InvalidOperationException(errorMessage);
}
else {
Console.WriteLine($"Export: target directory set to '{ExportBaseDirectory}' from configuration (EXPORTER:BASEDIRECTORY)");
}
}


// Construct export file name including full path
private string GetFilename(string modelTypeFullName, long exportFileNumber)
{
string exportFileNumberPaddedString = exportFileNumber.ToString("D10");
Expand All @@ -51,14 +65,28 @@ private string GetFilename(string modelTypeFullName, long exportFileNumber)
fileTypeString = "publication";
break;
}
return $"{ExportBaseDirectory}/{fileTypeString}-{exportFileNumberPaddedString}.json";
return $"{ExportBaseDirectory}{Path.DirectorySeparatorChar}{fileTypeString}-{exportFileNumberPaddedString}.json";
}


/*
* Export data from the Elasticsearch indexes into JSON text files.
* - Get the list of configured Elasticsearch indexes.
* - For each index, get all documents and
* - Convert them from the Elasticsearch model to the API model, which ensures the JSON files contain the same fields as the Public API endpoint.
* - Construct the export file name and path.
* - Write the data to a JSON file.
* - To work around the Elasticsearch limit of 10000 documents in a result set, the "search after" feature is used.
* - https://www.elastic.co/guide/en/elasticsearch/reference/7.17/paginate-search-results.html#search-after
* - Data is queried in smaller chunks, sorted by DocumentIndexOrder.
* - This is the most efficient way to sort documents.
* - The last hit of the previous query is stored.
* - Each new query includes a "search after" section containing the last hit from the previous query.
*/
public void Export(JsonSerializerOptions serializerOptions)
{
// Get Elasticsearch indexes and process them
var configuredTypesAndIndexNames = _indexNameSettings.GetTypesAndIndexNames();

foreach (var (indexName, modelType) in configuredTypesAndIndexNames)
{
long numberOfDocumentsInIndex = 0;
Expand Down
13 changes: 9 additions & 4 deletions aspnetcore/src/Exporter/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;

/*
* This application exports all documents from the Elasticsearch indexes into JSON files. Data is converted from the Elasticsearch model to the API model.
* The purpose of this application is to enable bulk export of all data. It is not intended to be executed automatically in production.
* Instead, it should be considered a tool that a developer can use when a full data dump in JSON file format is needed.
*
* The application uses the same configuration as the Indexer and Interface applications, except that it requires one additional parameter:
* EXPORTER:BASEDIRECTORY - sets the base directory where the JSON files are written. It must be defined without a trailing slash, for example, "/tmp".
*/

public class Program
{
private const int DefaultQueryTimeout = 300;
Expand All @@ -22,8 +31,6 @@ public static async Task Main(string[] args)
.AddUserSecrets<Program>()
.AddEnvironmentVariables()
.Build();

// Create and configure the host to support dependency injection, configuration, etc.
var consoleHost = CreateHostBuilder(args).Build();

// Define json serializer options
Expand Down Expand Up @@ -54,9 +61,7 @@ private static IHostBuilder CreateHostBuilder(string[] args) => Host
services.AddScoped<IElasticSearchIndexService, ElasticSearchIndexService>();
})
.ConfigureHostConfiguration(configurationBuilder => configurationBuilder
// Most of the configuration comes from environment variables.
.AddEnvironmentVariables()
// For local dev we get configuration from user secrets.
.AddUserSecrets(typeof(Program).Assembly, true)
.Build());
}

0 comments on commit c0661b7

Please sign in to comment.