From e765d864babb2fd08356c3eba9d3adf1c1d2c1c3 Mon Sep 17 00:00:00 2001 From: Webster Sheets Date: Tue, 10 Dec 2019 01:33:39 -0500 Subject: [PATCH] Add tools/hyg-database parser.cpp All of the parsing code is implemented except Bayer-Flamsteed names Still need to arrange writing JSON / CBOR data and applying compression Implement everything but LZ4 compression Friendly reminder that CBOR is network byte order encoded, AKA big-endian Validation, parsing, JSON and CBOR all working Only thing remaining is to handle LZ4 de/compression Split cbor writing to its own header --- CMakeLists.txt | 3 +- contrib/csv-parser/CMakeLists.txt | 5 - tools/CMakeLists.txt | 20 ++ tools/basic_cbor.h | 105 +++++++++ tools/hyg-database-to-json.cpp | 378 ++++++++++++++++++++++++++++++ 5 files changed, 505 insertions(+), 6 deletions(-) delete mode 100644 contrib/csv-parser/CMakeLists.txt create mode 100644 tools/CMakeLists.txt create mode 100644 tools/basic_cbor.h create mode 100644 tools/hyg-database-to-json.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 03f488ccd65..0c327a508df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -218,7 +218,6 @@ endif (MSVC) find_package(Freetype REQUIRED) find_package(OpenGL REQUIRED) -add_subdirectory(contrib/csv-parser) add_subdirectory(contrib/lz4) add_subdirectory(contrib/fmt) @@ -334,6 +333,8 @@ target_link_libraries(${PROJECT_NAME} LINK_PRIVATE ${pioneerLibs} ${winLibs}) target_link_libraries(modelcompiler LINK_PRIVATE ${pioneerLibs} ${winLibs}) target_link_libraries(savegamedump LINK_PRIVATE pioneer-core ${SDL2_IMAGE_LIBRARIES} ${winLibs}) +add_subdirectory(tools) + set_cxx11_properties(${PROJECT_NAME} modelcompiler savegamedump) if(MSVC) diff --git a/contrib/csv-parser/CMakeLists.txt b/contrib/csv-parser/CMakeLists.txt deleted file mode 100644 index 2f9c04e6f5e..00000000000 --- a/contrib/csv-parser/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -project(csv-parser LANGUAGES CXX) - -add_library(${PROJECT_NAME} INTERFACE) 
-target_include_directories(${PROJECT_NAME} INTERFACE - ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt new file mode 100644 index 00000000000..e144e1c0968 --- /dev/null +++ b/tools/CMakeLists.txt @@ -0,0 +1,20 @@ + +list(APPEND HYG_DATABASE_PARSER_SOURCES + hyg-database-to-json.cpp + ${CMAKE_SOURCE_DIR}/src/LZ4Format.cpp +) + +find_package(Threads REQUIRED) +add_executable(hyg-database-parser ${HYG_DATABASE_PARSER_SOURCES}) +target_link_libraries(hyg-database-parser PRIVATE + Threads::Threads + lz4) +target_include_directories(hyg-database-parser PRIVATE + ${CMAKE_SOURCE_DIR}/contrib/string-view-lite + ${CMAKE_SOURCE_DIR}/contrib/csv-parser) + +set_target_properties(${PROJECT_NAME} modelcompiler savegamedump pioneerLib PROPERTIES + CXX_STANDARD 11 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS ON +) diff --git a/tools/basic_cbor.h b/tools/basic_cbor.h new file mode 100644 index 00000000000..3ccda61b726 --- /dev/null +++ b/tools/basic_cbor.h @@ -0,0 +1,105 @@ +// Copyright © 2008-2020 Pioneer Developers. See AUTHORS.txt for details +// Licensed under the terms of the GPL v3. See licenses/GPL-3.txt + +/* + The basic guts of a CBOR encoder - we're not concerned with full standards + compliance, merely with efficiently outputting valid data that + nlohmann::json can then read in as JSON data. + + REMEMBER THAT CBOR IS BIG-ENDIAN ENCODED! 
+*/
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <vector>
+
+// CBOR major types: the top three bits of a data item's initial byte.
+enum class CBORTag {
+	Integer = 0,
+	NegInteger = 1,
+	ByteString = 2,
+	String = 3,
+	Array = 4,
+	Object = 5,
+	Simple = 7
+};
+
+// Everything below is inline or a template so that the header stays ODR-safe.
+namespace cbor {
+
+	// Reverse the four bytes starting at `data` in place.
+	inline void swap4(uint8_t *data)
+	{
+		const uint8_t first = data[0], second = data[1];
+		data[0] = data[3];
+		data[1] = data[2];
+		data[2] = second;
+		data[3] = first;
+	}
+
+	// Append a 4-byte value in big-endian order (memcpy + shifts: no aliasing or host-endian assumptions).
+	template <typename T>
+	void push4(std::vector<uint8_t> &out, const T val)
+	{
+		static_assert(sizeof(T) == 4, "push4 requires a 4-byte value");
+		uint32_t bits = 0;
+		std::memcpy(&bits, &val, sizeof(bits));
+		for (int shift = 24; shift >= 0; shift -= 8)
+			out.push_back(static_cast<uint8_t>(bits >> shift & 0xFF));
+	}
+
+	// Push a tag and an additional information value up to 32 bits.
+	inline void push_tag(std::vector<uint8_t> &out, const CBORTag tag, const uint32_t val)
+	{
+		const uint8_t tagVal = static_cast<uint8_t>(tag) << 5;
+		if (val < 24) {
+			out.push_back(tagVal | (val & 0xFF));
+		} else if (val < (1u << 8)) {
+			out.push_back(tagVal | 24);
+			out.push_back(val & 0xFF);
+		} else if (val < (1u << 16)) {
+			out.push_back(tagVal | 25);
+			out.push_back(val >> 8 & 0xFF);
+			out.push_back(val & 0xFF);
+		} else {
+			out.push_back(tagVal | 26);
+			push4(out, val);
+		}
+	}
+
+	// Push a major-type-7 head. Values below 32 go straight into the initial byte (push_float
+	// relies on that to emit additional info 26); larger ones are written as 0xF8 + value.
+	inline void push_simple(std::vector<uint8_t> &out, const uint8_t simple)
+	{
+		const uint8_t major = static_cast<uint8_t>(CBORTag::Simple) << 5;
+		out.push_back(simple < 32 ? major | simple : major | 24);
+		if (simple >= 32)
+			out.push_back(simple);
+	}
+
+	// Push a single-precision float: head byte with additional info 26, then 4 big-endian bytes.
+	inline void push_float(std::vector<uint8_t> &out, const float val)
+	{
+		push_simple(out, 26);
+		push4(out, val);
+	}
+
+	// Push a text string: a length head followed by the bytes of `string` (nothing for "").
+	inline void push_string(std::vector<uint8_t> &out, const std::string &string)
+	{
+		push_tag(out, CBORTag::String, string.size());
+		out.insert(out.end(), string.begin(), string.end());
+	}
+
+	// Push a signed integer; a negative n is stored as NegInteger with the value -1 - n.
+	inline void push_int(std::vector<uint8_t> &out, const int val)
+	{
+		if (val < 0)
+			push_tag(out, CBORTag::NegInteger, -1 - val);
+		else
+			push_tag(out, CBORTag::Integer, val);
+	}
+} // namespace cbor
diff --git a/tools/hyg-database-to-json.cpp b/tools/hyg-database-to-json.cpp
new file mode 100644
index 00000000000..03672b5d47e
--- /dev/null
+++ b/tools/hyg-database-to-json.cpp
@@ -0,0 +1,418 @@
+// Copyright © 2008-2020 Pioneer Developers. See AUTHORS.txt for details
+// Licensed under the terms of the GPL v3. See licenses/GPL-3.txt
+
+#include "LZ4Format.h"
+#include "basic_cbor.h"
+
+#include "argh/argh.h"
+#include "csv-parser/csv.hpp"
+#include "json/json.hpp"
+
+#include <algorithm>
+#include <cmath>
+#include <fstream>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+using Json = nlohmann::json;
+using csv::CSVReader;
+using csv::CSVRow;
+using nonstd::string_view;
+using std::to_string;
+
+// One star as read from a row of the HYG database.
+struct StarData {
+	int32_t starID; // database primary key (column 0)
+	float x, y, z; // Cartesian position (columns 17-19)
+	std::string spectralType;
+	float absoluteMagnitude;
+	std::string starName;
+	std::string constellation; // three-letter abbreviation, or empty
+	float colorIndex;
+
+	Json to_json() const;
+	void to_cbor(std::vector<uint8_t> &out) const;
+};
+
+/*
+	# Fields in the HYG database:
+	(note: field indices were taken and updated from the GH page and may not be 100% correct)
+
+	0 id: The database primary key.
+	1 hip: The star's ID in the Hipparcos catalog, if known.
+	2 hd: The star's ID in the Henry Draper catalog, if known.
+	3 hr: The star's ID in the Harvard Revised catalog, which is the same as its number in the Yale Bright Star Catalog.
+	4 gl: The star's ID in the third edition of the Gliese Catalog of Nearby Stars.
+	5 bf: The Bayer / Flamsteed designation, primarily from the Fifth Edition of the Yale Bright Star Catalog. This is a combination of the two designations. The Flamsteed number, if present, is given first; then a three-letter abbreviation for the Bayer Greek letter; the Bayer superscript number, if present; and finally, the three-letter constellation abbreviation. Thus Alpha Andromedae has the field value "21Alp And", and Kappa1 Sculptoris (no Flamsteed number) has "Kap1Scl".
+	6 proper: A common name for the star, such as "Barnard's Star" or "Sirius". I have taken these names primarily from the Hipparcos project's web site, which lists representative names for the 150 brightest stars and many of the 150 closest stars. I have added a few names to this list. Most of the additions are designations from catalogs mostly now forgotten (e.g., Lalande, Groombridge, and Gould ["G."]) except for certain nearby stars which are still best known by these designations.
+	7 ra, dec: The star's right ascension and declination, for epoch and equinox 2000.0.
+	9 dist: The star's distance in parsecs, the most common unit in astrometry. To convert parsecs to light years, multiply by 3.262. A value >= 100000 indicates missing or dubious (e.g., negative) parallax data in Hipparcos.
+	10 pmra, pmdec: The star's proper motion in right ascension and declination, in milliarcseconds per year.
+	12 rv: The star's radial velocity in km/sec, where known.
+	13 mag: The star's apparent visual magnitude.
+	14 absmag: The star's absolute visual magnitude (its apparent magnitude from a distance of 10 parsecs).
+	15 spect: The star's spectral type, if known.
+	16 ci: The star's color index (blue magnitude - visual magnitude), where known.
+	17 x,y,z: The Cartesian coordinates of the star, in a system based on the equatorial coordinates as seen from Earth. +X is in the direction of the vernal equinox (at epoch 2000), +Z towards the north celestial pole, and +Y in the direction of R.A. 6 hours, declination 0 degrees.
+	20 vx,vy,vz: The Cartesian velocity components of the star, in the same coordinate system described immediately above. They are determined from the proper motion and the radial velocity (when known). The velocity unit is parsecs per year; these are small values (around 1 millionth of a parsec per year), but they enormously simplify calculations using parsecs as base units for celestial mapping.
+	23 rarad, decrad, pmrarad, pmdecrad: The positions in radians, and proper motions in radians per year.
+	27 bayer: The Bayer designation as a distinct value
+	28 flam: The Flamsteed number as a distinct value
+	29 con: The standard constellation abbreviation
+	30 comp, comp_primary, base: Identifies a star in a multiple star system. comp = ID of companion star, comp_primary = ID of primary star for this component, and base = catalog ID or name for this multi-star system. Currently only used for Gliese stars.
+	33 lum: Star's luminosity as a multiple of Solar luminosity.
+	34 var: Star's standard variable star designation, when known.
+	35 var_min, var_max: Star's approximate magnitude range, for variables. This value is based on the Hp magnitudes for the range in the original Hipparcos catalog, adjusted to the V magnitude scale to match the "mag" field.
+*/
+
+// Choose the best available designation for a row, in priority order: proper
+// name, Gliese, Henry Draper, Harvard Revised, Hipparcos. Falls back to
+// "INV <id>" when the row carries none of them.
+std::string get_star_name(const CSVRow &row)
+{
+	// Proper name
+	if (row[6].is_str())
+		return row[6].get<std::string>();
+
+	// Compressed Bayer-Flamsteed designation (column 5)
+	// e.g. 85 Peg -> 85 Pegasus
+	// TODO: not implemented yet; such rows fall through to the catalog IDs below.
+
+	// Gliese catalog definition
+	// e.g. Gl 914A
+	if (row[4].is_str())
+		return row[4].get<std::string>();
+
+	// Henry Draper catalog number
+	// e.g. 21406 -> HD 21406
+	if (row[2].is_int())
+		return "HD " + to_string(row[2].get<int>());
+
+	// Yale / Harvard catalog number
+	// e.g. 21406 -> HR 21406
+	if (row[3].is_int())
+		return "HR " + to_string(row[3].get<int>());
+
+	// Hipparcos catalog number
+	// e.g. 11 -> HIP 11
+	if (row[1].is_int())
+		return "HIP " + to_string(row[1].get<int>());
+
+	return "INV " + to_string(row[0].get<int>());
+}
+
+// Convert every row of an opened HYG CSV file into a StarData record.
+// Returns an empty vector (after reporting on stderr) when the header does not
+// look like HYG v3; rows whose numeric fields fail to parse are reported and
+// skipped. `filename` is only used in diagnostics.
+std::vector<StarData> parse_csv(std::string &filename, CSVReader &reader)
+{
+	// highest column index read below (29, "con")
+	const size_t lastColumn = 29;
+
+	const auto &column_names = reader.get_col_names();
+	// reject the file when it is too narrow or when *either* marker column is wrong
+	if (column_names.size() <= lastColumn || column_names[0] != "id" || column_names[6] != "proper") {
+		std::cerr << "Invalid database format in file " + filename << std::endl;
+		std::cerr << "Expected HYG v3 database format." << std::endl;
+		return {};
+	}
+
+	std::vector<StarData> returnData;
+	std::for_each(reader.begin(), reader.end(), [&](const CSVRow &row) {
+		StarData this_star{};
+
+		this_star.starID = row[0].get<int>();
+		this_star.starName = get_star_name(row);
+
+		try {
+			this_star.spectralType = row[15].get<std::string>();
+			this_star.absoluteMagnitude = row[14].get<float>();
+
+			// if we don't have a color index, default to approx. a G0V class star
+			this_star.colorIndex = row[16].is_num() ? row[16].get<float>() : 0.58f;
+		} catch (const std::runtime_error &e) {
+			std::cerr << "Error parsing star data for star " << this_star.starID << std::endl;
+			std::cerr << e.what() << std::endl;
+			return;
+		}
+
+		string_view con = row[29].get<string_view>();
+		if (con.size() >= 3) // should only ever be 3 or 0, but manually enforce the length anyways
+			this_star.constellation = con.substr(0, 3).to_string();
+		else
+			this_star.constellation = "";
+
+		try {
+			this_star.x = row[17].get<float>();
+			this_star.y = row[18].get<float>();
+			this_star.z = row[19].get<float>();
+		} catch (const std::runtime_error &e) {
+			std::cerr << "Error parsing star location for star " << this_star.starID << std::endl;
+			std::cerr << e.what() << std::endl;
+			return;
+		}
+
+		returnData.push_back(std::move(this_star));
+	});
+
+	return returnData;
+}
+
+static const float PARSEC_TO_LY = 3.261564f;
+// edge length of one sector cube, in light years
+static const float SECTOR_SIZE = 8.0f;
+
+// Sector index along one axis for the parsec coordinate `x`. The HYG notes
+// above say parsecs are *multiplied* by ~3.262 to obtain light years, and the
+// floor is taken after dividing by the sector size so that negative
+// coordinates land in sectors -1, -2, ... rather than collapsing into sector 0.
+int sector_coord(float x)
+{
+	return int(std::floor(x * PARSEC_TO_LY / SECTOR_SIZE));
+}
+
+// Offset of the parsec coordinate `x` from the low edge of its sector, in light
+// years. Non-negative for every input, so it pairs with sector_coord().
+float system_loc(float x)
+{
+	const float ly = x * PARSEC_TO_LY;
+	return ly - SECTOR_SIZE * std::floor(ly / SECTOR_SIZE);
+}
+
+// Object format for JSON / CBOR star data:
+/*
+	{
+		"name": "HD 12353",
+		"absmag": 1235.04,
+		"spectral": "GVIII",
+		"color": 1.5031,
+		"constl": "Pegasus",
+		"sector": [ 4, 5, 6 ],
+		"coords": [ 5.42134, 0.2145, 7.999 ]
+	}
+*/
+
+// Append this star to `out` as a CBOR map with the fields shown above.
+// while it might be theoretically easier to simply use Json::to_cbor,
+// this method is 1.6x faster and saves a bit of memory by using floats
+// instead of only doubles.
+void StarData::to_cbor(std::vector<uint8_t> &out) const
+{
+	// remember to update the number of fields when adding more to the StarData type.
+	cbor::push_tag(out, CBORTag::Object, 7);
+	cbor::push_string(out, "name");
+	cbor::push_string(out, starName);
+
+	cbor::push_string(out, "absmag");
+	cbor::push_float(out, absoluteMagnitude);
+
+	cbor::push_string(out, "spectral");
+	cbor::push_string(out, spectralType);
+
+	cbor::push_string(out, "color");
+	cbor::push_float(out, colorIndex);
+
+	cbor::push_string(out, "constl");
+	cbor::push_string(out, constellation);
+
+	cbor::push_string(out, "sector");
+	cbor::push_tag(out, CBORTag::Array, 3);
+	for (const float axis : { x, y, z })
+		cbor::push_int(out, sector_coord(axis));
+
+	// same sector-relative offsets as to_json(), so both formats agree
+	cbor::push_string(out, "coords");
+	cbor::push_tag(out, CBORTag::Array, 3);
+	for (const float axis : { x, y, z })
+		cbor::push_float(out, system_loc(axis));
+}
+
+// Build the JSON object for this star (same fields and values as to_cbor()).
+Json StarData::to_json() const
+{
+	Json star_obj = {};
+
+	star_obj["name"] = starName;
+	star_obj["absmag"] = absoluteMagnitude;
+	star_obj["spectral"] = spectralType;
+	star_obj["color"] = colorIndex;
+	star_obj["constl"] = constellation;
+
+	// FIXME: temporary workaround until I get clang-format to be sane with initializer lists
+	/* clang-format off */
+	star_obj["sector"] = Json::array({
+		sector_coord(x),
+		sector_coord(y),
+		sector_coord(z)
+	});
+
+	star_obj["coords"] = Json::array({
+		system_loc(x),
+		system_loc(y),
+		system_loc(z)
+	});
+	/* clang-format on */
+
+	return star_obj;
+}
+
+// Parse every input CSV named on the command line and write all stars to
+// std::cout: as a CBOR indefinite-length array (LZ4-compressed when `compress`
+// is set) if `outputBinary`, otherwise as a JSON array.
+// Returns 0 on success, 1 if reading a file threw.
+int parse_database(argh::parser &sourceFiles, bool outputBinary, bool compress)
+{
+	std::vector<uint8_t> output_accum;
+	output_accum.reserve(1024);
+	// unbounded CBOR array
+	output_accum.push_back(static_cast<uint8_t>(CBORTag::Array) << 5 | 31);
+
+	Json fileObject = Json::array();
+
+	// NOTE(review): argh appears to count argv[0] as a positional argument; confirm
+	// that this loop is not also handed the program path as an input file.
+	for (auto inputFile : sourceFiles) {
+		try {
+			CSVReader reader(inputFile);
+			std::vector<StarData> data = parse_csv(inputFile, reader);
+
+			// by reference: StarData owns three strings, no need to copy each star
+			for (const auto &star : data) {
+				if (outputBinary) {
+					star.to_cbor(output_accum);
+				} else {
+					fileObject.push_back(star.to_json());
+				}
+			}
+		} catch (const std::runtime_error &e) {
+			std::cerr << "Error parsing csv file:" << std::endl;
+			std::cerr << e.what() << std::endl;
+			return 1;
+		}
+	}
+
+	// CBOR 'break' tag
+	output_accum.push_back(0xFF);
+
+	if (outputBinary) {
+		string_view data(
+			reinterpret_cast<const char *>(output_accum.data()),
+			output_accum.size());
+
+		if (compress)
+			// use compression level 6 - there's no noticeable difference in size
+			// between 6 and 12, and the latter takes 4x as long
+			std::cout << lz4::CompressLZ4(data, 6);
+		else
+			std::cout << data;
+
+	} else {
+		std::cout << fileObject.dump() << std::endl;
+	}
+
+	std::flush(std::cout);
+	return 0;
+}
+
+// Read the first positional argument as a CBOR file (optionally LZ4-compressed)
+// and print it to std::cout as JSON.
+// Returns 0 on success; 1 if the file cannot be read, is not a CBOR
+// indefinite-length array, or fails to decode.
+int run_validation(argh::parser &sourceFiles)
+{
+	// NOTE(review): if argh stores argv[0] as positional 0 this names the program
+	// itself rather than the first FILE argument -- confirm against argh's docs.
+	std::string input = sourceFiles[0];
+
+	// binary mode: the payload is CBOR / LZ4 and must not be newline-translated
+	std::ifstream inputStream(input, std::ios::binary | std::ios::ate);
+	const std::streamoff fileSize = inputStream.tellg(); // -1 if the open failed
+	if (fileSize < 0) {
+		std::cerr << "Could not read input file " << input << std::endl;
+		return 1;
+	}
+
+	std::string fileData(static_cast<size_t>(fileSize), '\0');
+	inputStream.seekg(0, std::ios::beg);
+	if (!fileData.empty())
+		inputStream.read(&fileData[0], fileSize);
+	inputStream.close();
+	string_view data{ fileData.data(), fileData.size() };
+
+	std::string decompressed_data;
+	// LZ4 magic number
+	if (lz4::IsLZ4Format(data.data(), data.size())) {
+		decompressed_data = lz4::DecompressLZ4(data);
+		data = { decompressed_data.data(), decompressed_data.size() };
+	}
+
+	// CBOR magic number (0x9F opens an indefinite-length array); the emptiness
+	// check keeps a zero-byte file from being indexed out of bounds
+	if (data.empty() || static_cast<uint8_t>(data[0]) != 0x9F) {
+		std::cout << "Invalid input file " << input << ": file is not valid CBOR data!" << std::endl;
+		return 1;
+	}
+
+	try {
+		Json inputJson = Json::from_cbor({ data.data(), data.size() });
+		std::cout << inputJson.dump() << std::endl;
+	} catch (const std::exception &e) {
+		std::cout << e.what() << std::endl;
+		// a payload that fails to decode is a validation failure, not a success
+		return 1;
+	}
+
+	return 0;
+}
+
+// Entry point: parse the command line, optionally redirect std::cout to the -o file, then convert or validate.
+int main(int argc, const char **argv)
+{
+	std::string help_text =
+		"This program parses the HYG database of stars and converts it into a listing of\n"
+		"custom stars for Pioneer to use.\n\n"
+		"USAGE:\n"
+		"\thyg-database-parser [OPTIONS] [FILE...]\n\n"
+		"OPTIONS:\n\n"
+		"\t-h --help Display this help menu\n"
+		"\t-o Output file name. If not present, writes to stdout\n"
+		"\t --binary Output compressed CBOR files (default)\n"
+		"\t --no-compress Output uncompressed CBOR files\n"
+		"\t --validate Interpret input files as CBOR and convert to equivalent\n"
+		"\t JSON for round-trip validation\n";
+
+	argh::parser args(argv);
+
+	if (args[{ "-h", "--help" }] || args.size() == 0) {
+		std::cerr << help_text;
+		return 0;
+	}
+
+	bool binary = args["--binary"];
+	bool compress = !args["--no-compress"];
+	bool validate = args["--validate"];
+	std::string outputFile;
+
+	// Owned by main (not heap-allocated and leaked) and declared before the
+	// redirect so it outlives every write made through std::cout.
+	std::filebuf outputBuffer;
+	std::streambuf *const consoleBuffer = std::cout.rdbuf();
+	if (args("o") >> outputFile) {
+		// binary mode keeps CBOR / LZ4 bytes free of newline translation
+		if (!outputBuffer.open(outputFile, std::ios::out | std::ios::binary)) {
+			std::cerr << "Could not open output file " << outputFile << std::endl;
+			return 1;
+		}
+		std::cout.rdbuf(&outputBuffer);
+	}
+
+	const int result = validate ? run_validation(args) : parse_database(args, binary, compress);
+
+	// flush, then detach the file before outputBuffer is destroyed
+	std::cout.flush();
+	std::cout.rdbuf(consoleBuffer);
+	return result;
+}