diff --git a/doc/_static/ingest-options-pull.png b/doc/_static/ingest-options-pull.png new file mode 100755 index 0000000000..1d25db0f3a Binary files /dev/null and b/doc/_static/ingest-options-pull.png differ diff --git a/doc/_static/ingest-options-push.png b/doc/_static/ingest-options-push.png new file mode 100755 index 0000000000..db9aa5338d Binary files /dev/null and b/doc/_static/ingest-options-push.png differ diff --git a/doc/_static/ingest-options-read.png b/doc/_static/ingest-options-read.png new file mode 100755 index 0000000000..6c2b96bb81 Binary files /dev/null and b/doc/_static/ingest-options-read.png differ diff --git a/doc/_static/ingest-options.pptx b/doc/_static/ingest-options.pptx new file mode 100644 index 0000000000..5017c02936 Binary files /dev/null and b/doc/_static/ingest-options.pptx differ diff --git a/doc/_static/ingest-table-types-dependent.png b/doc/_static/ingest-table-types-dependent.png new file mode 100755 index 0000000000..f58dbc3c57 Binary files /dev/null and b/doc/_static/ingest-table-types-dependent.png differ diff --git a/doc/_static/ingest-table-types-partitioned.png b/doc/_static/ingest-table-types-partitioned.png new file mode 100755 index 0000000000..0e5227da3c Binary files /dev/null and b/doc/_static/ingest-table-types-partitioned.png differ diff --git a/doc/_static/ingest-table-types-regular.png b/doc/_static/ingest-table-types-regular.png new file mode 100755 index 0000000000..307ef1fe48 Binary files /dev/null and b/doc/_static/ingest-table-types-regular.png differ diff --git a/doc/_static/ingest-table-types.pptx b/doc/_static/ingest-table-types.pptx new file mode 100644 index 0000000000..147cc20647 Binary files /dev/null and b/doc/_static/ingest-table-types.pptx differ diff --git a/doc/_static/ingest-trans-multiple-chunks.png b/doc/_static/ingest-trans-multiple-chunks.png new file mode 100755 index 0000000000..82b077c859 Binary files /dev/null and b/doc/_static/ingest-trans-multiple-chunks.png differ diff --git a/doc/_static/ingest-trans-multiple-one.png b/doc/_static/ingest-trans-multiple-one.png new file mode 100755 index 0000000000..27ea3c378c Binary files /dev/null and b/doc/_static/ingest-trans-multiple-one.png differ diff --git a/doc/_static/ingest-trans-multiple-scattered.png b/doc/_static/ingest-trans-multiple-scattered.png new file mode 100755 index 0000000000..5ca2698033 Binary files /dev/null and b/doc/_static/ingest-trans-multiple-scattered.png differ diff --git a/doc/_static/ingest-trans-multiple.pptx b/doc/_static/ingest-trans-multiple.pptx new file mode 100644 index 0000000000..f10512be8a Binary files /dev/null and b/doc/_static/ingest-trans-multiple.pptx differ diff --git a/doc/_static/ingest-transaction-fsm.png b/doc/_static/ingest-transaction-fsm.png new file mode 100755 index 0000000000..29d4a076b5 Binary files /dev/null and b/doc/_static/ingest-transaction-fsm.png differ diff --git a/doc/_static/ingest-transaction-fsm.pptx b/doc/_static/ingest-transaction-fsm.pptx new file mode 100644 index 0000000000..7f2c2bbf0c Binary files /dev/null and b/doc/_static/ingest-transaction-fsm.pptx differ diff --git a/doc/_static/ingest-transactions-aborted.png b/doc/_static/ingest-transactions-aborted.png new file mode 100755 index 0000000000..f5e7bcf81c Binary files /dev/null and b/doc/_static/ingest-transactions-aborted.png differ diff --git a/doc/_static/ingest-transactions-aborted.pptx b/doc/_static/ingest-transactions-aborted.pptx new file mode 100644 index 0000000000..adfae1ad62 Binary files /dev/null and 
b/doc/_static/ingest-transactions-aborted.pptx differ diff --git a/doc/_static/ingest-transactions-failed.png b/doc/_static/ingest-transactions-failed.png new file mode 100755 index 0000000000..d2a0d60d5f Binary files /dev/null and b/doc/_static/ingest-transactions-failed.png differ diff --git a/doc/_static/ingest-transactions-failed.pptx b/doc/_static/ingest-transactions-failed.pptx new file mode 100644 index 0000000000..1ffe243f9f Binary files /dev/null and b/doc/_static/ingest-transactions-failed.pptx differ diff --git a/doc/_static/ingest-transactions-resolved.png b/doc/_static/ingest-transactions-resolved.png new file mode 100755 index 0000000000..a321adbd1c Binary files /dev/null and b/doc/_static/ingest-transactions-resolved.png differ diff --git a/doc/_static/ingest-transactions-resolved.pptx b/doc/_static/ingest-transactions-resolved.pptx new file mode 100644 index 0000000000..932f344e0a Binary files /dev/null and b/doc/_static/ingest-transactions-resolved.pptx differ diff --git a/doc/_static/subchunks.png b/doc/_static/subchunks.png new file mode 100644 index 0000000000..c2fe77ff0d Binary files /dev/null and b/doc/_static/subchunks.png differ diff --git a/doc/admin/data-table-indexes.rst b/doc/admin/data-table-indexes.rst new file mode 100644 index 0000000000..b6108d206e --- /dev/null +++ b/doc/admin/data-table-indexes.rst @@ -0,0 +1,608 @@ + +.. _admin-data-table-index: + +================== +Data Table Indexes +================== + +.. note:: + + All operations on indexes are only allowed on databases that are in the published state. The system will + refuse all requests on databases that are in a process of being ingested. This is done for three reasons: + + #. To avoid interfering with the catalog ingest operations. + #. To prevent returning an inconsistent (or wrong) state of the indexes + #. To prevent transient errors due to potential race conditions since the overall state of the distributed catalogs + might be being changed by the Ingest system. + +.. _admin-data-table-index-intro: + +Introduction +------------ + +This document explains how to use the index management API of the Qserv Replication System. Services provided by the system +are related to the following index management operations in MySQL: + +.. code-block:: sql + + SHOW INDEX FROM ... + CREATE ... INDEX ON
<table> ... + DROP INDEX ON <table>
... + +However, unlike the single table operations listed above, the services allow managing groups of related tables distributed +across Qserv worker nodes. Hence, each request for any service refers to a single such group. In particular: + +- if a request refers to the *regular* (fully replicated) table then all instances of the table will be affected by the request. +- otherwise (in the case of the *partitioned* tables) each replica of every chunk of the table will be included in an operation. + Note that the index management services differentiate between the *chunk* tables themselves and the corresponding *full overlap* + tables. When submitting a request, a user will have to choose which of those tables will be included in the operation. + +The latter would also be reflected in the result and error reports made by the services. The JSON objects returned by +the services would return the names of the *final* tables affected by the operations, not the names of the corresponding +*base* name of a table specified in service requests. This is done to facilitate investigating problems with Qserv should +they occur. See more on the *base* and *final* table names in the section: + +- :ref:`ingest-general-base-table-names` (REST) + +Things to consider before creating indexes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The recommendations mentioned in this section are not complete. Please, get yourself familiarized with +the MySQL documentation on indexes before attempting the operation. Read the following instructions on +the index creation command: + +- https://dev.mysql.com/doc/refman/8.4/en/create-index.html + +Always keep in mind that there is the MySQL (or alike) machinery behind Qserv and that the data tables +in Qserv are presently based on the MyISAM storage engine: + +- https://dev.mysql.com/doc/refman/8.4/en/myisam-storage-engine.html + +These are a few points to consider. + +General: + +- Indexes are created on tables, and not on views. +- Indexes are created in the scope of the table, and not databases. +- In the case of the *partitioned* tables, the *chunk* and *full* overlap may not need to have the same set of indexes. + In some cases, it may not be even possible, for instance, due to different ``UNIQUE`` constraints requirements. +- Please, provide a reasonable comment for an index, even though, comments aren't mandatory. Your comments could be useful + for bookkeeping purposes to know why and what the index was created for. +- Be aware that indexes take additional space on the Qserv workers' filesystems where the database tables are residing. + Potentially, if too many indexes were created, MySQL may run out of disk space and stop working. The rule of thumb for + estimating the amount of space to be taken by an index is based on the sizes of columns mentioned in an index + specification (explained in the next section) multiplied by the total number of rows in a table (on which the index + is being created). There are other factors here, such as the type of a table (regular or partitioned) as well as + the number of replicas for the partitioned tables. The index management service which is used for creating indexes + will also make the best attempt to prevent creating indexes if it will detect that the amount of available disk space + is falling too low. In this case, the service will refuse to create an index and an error will be reported back. 
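+As a rough illustration of the rule of thumb above, here is a back-of-the-envelope estimate for a
+hypothetical case (the column type, row count, and replica count below are made-up examples, not
+values taken from a real catalog): an index on a single ``BIGINT`` column of a table with
+2 billion rows and 3 replicas of each chunk would require approximately:
+
+.. code-block:: sql
+
+    -- 8 bytes per BIGINT value x 2,000,000,000 rows x 3 replicas, converted to GiB.
+    -- Index metadata adds further overhead on top of this estimate.
+    SELECT 8 * 2000000000 * 3 / POW(1024, 3) AS estimated_index_size_gib;  -- roughly 45 GiB
+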
+ +When choosing a name for an index: + +- The name should be unique (don't confuse this with the UNIQUE keyword used in the index specifications) in the scope + of the table. It means it should not be already taken by some other index. Always check which indexes already exist + in a table before creating a new one. +- Generally, the name must adhere to the MySQL requirements for identifiers as explained + in: + + - https://dev.mysql.com/doc/refman/8.4/en/identifier-qualifiers.html + +- Keep in mind that names of identifiers (including indexes) in Qserv are case-insensitive. This is not the general + requirement in MySQL, where the case sensitivity of identifiers is configurable one way or another. It's because + of a design decision the original Qserv developers made to configure the underlying MySQL machinery. +- To make things simple, restrain from using special characters (all but the underscore one ``_``). +- The length of the name should not exceed 64 characters: + + - https://dev.mysql.com/doc/refman/8.4/en/identifier-length.html + +Error reporting +^^^^^^^^^^^^^^^^ + +All services mentioned in the document are the Replication system's services, and they adhere to the same error reporting +schema. The schema is explained in the document: + +- :ref:`ingest-general-error-reporting` (REST) + +In addition, the index managemnt services may return the service-specific error conditions in the ``error_ext`` attribute: + +.. code-block: + + "error_ext" : { + "job_state" : , + "workers" : { + : { +
: { + "request_status" : , + "request_error" : + }, + ... + }, + ... + } + } + +Where: + +``job_state`` : *string* + The completion status (state) of the corresponding C++ job classes: + + - ``lsst::qserv::replica::SqlGetIndexesJob`` + - ``lsst::qserv::replica::SqlCreateIndexesJob`` + - ``lsst::qserv::replica::SqlDropIndexesJob`` + + The status will be serialized into a string. The explanation of the possible values could be found in + the following C++ header: + + - https://github.com/lsst/qserv/blob/master/src/replica/jobs/Job.h + + Look for this enum type: + + .. code-block:: + + enum ExtendedState { + NONE, ///< No extended state exists at this time. + SUCCESS, ///< The job has been fully implemented. + CONFIG_ERROR, ///< Problems with job configuration found. + FAILED, ///< The job has failed. + ... + } + + Also look for worker/table-specific errors in a JSON object explained below. + +``workers`` : *object* + The JSON object which has unique identifiers of workers (attribute ``worker``) as the keys, where the corresponding + value for the worker is another JSON object which has names of worker-side tables as the next-level keys for + descriptions of problems with managing indexes for the corresponding tables. + + .. note: + + Only the problematic tables (if any) would be mentioned in the report. If no problems were seen during + the index management operations then a collection of workers and tables will be empty. + +``request_status`` : *string* + The completion status (state) of the index creation C++ request classes: + + - ``SqlGetIndexesRequest`` + - ``SqlCreateIndexesRequest`` + - ``SqlDropIndexesRequest`` + + The status will be serialized into a string. More information on possible values could be found in the + following C++ header: + + - https://github.com/lsst/qserv/blob/main/src/replica/requests/Request.h + + Look for this enum type: + + .. code-block:: + + enum ExtendedState { + NONE, /// No extended state exists at this time + SUCCESS, /// The request has been fully implemented + CLIENT_ERROR, /// The request could not be implemented due to + /// an unrecoverable client-side error. + SERVER_BAD, /// Server reports that the request can not be implemented + /// due to incorrect parameters, etc. + SERVER_ERROR, /// The request could not be implemented due to + /// an unrecoverable server-side error. + ... + }; + +``request_error`` : *string* + This string provides an expanded explanation of an error reported by the Replication system's worker (in case if the + request failed on the worker's side and is reported to the service). + +.. note:: + + **Reporting partial successes or failures** + + Since the index management requests may (will) involve multiple tables, the corresponding operations may be potentially + partially successful and partially not successful. All failures for specific indexes which couldn't be managed (created, + queried, or deleted) would be reported as explained in the previous section. For example, that would be a case if a request + was made to drop a known to-exist index, and if no such index existed for some final tables. There may be various reasons + why this might happen. An explanation of the reasons is beyond a scope of this document. The best way a user should treat + this situation is to expect that the service would do the "best effort" of removing the index. It's also allowed to run + the index removal request multiple times. This won't make any harm. 
All subsequent requests will report failures for all + final tables in the specified group of tables. + +.. _admin-data-table-index-create: + +Creating +-------- + +To create a new index, a user must submit a request to the service: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``POST`` + - ``/replication/sql/index`` + +Where the request object has the following schema: + +.. code-block:: + + { "database" : <string>, + "table" : <string>, + "overlap" : <number>, + "index" : <string>, + "spec" : <string>, + "comment" : <string>, + "columns" : [ + { "column" : <string>, + "length" : <number>, + "ascending" : <number> + }, + ... + ], + "auth_key" : <string> + } + +Where: + +``database`` : *string* + The required name of the database where the table resides. + +``table`` : *string* + The required *base* name of the table where the index will be created. + +``overlap`` : *number* := ``0`` + The optional *overlap* flag indicating a sub-type of the *chunk* table. The value should be one of the following: + + - ``1`` : *full overlap* + - ``0`` : *chunk* + +``index`` : *string* + The required name of the index to be created. + +``spec`` : *string* + The required index specification. The specification should adhere to the MySQL requirements for the index creation command: + + - https://dev.mysql.com/doc/refman/8.4/en/create-index.html + + The value of the parameter should be one of the following keywords: ``DEFAULT``, ``UNIQUE``, ``FULLTEXT``, or ``SPATIAL``. + All but the first one (``DEFAULT``) are mentioned in the MySQL documentation. The keywords map to the index + creation command: + + .. code-block:: sql + + CREATE [UNIQUE | FULLTEXT | SPATIAL] INDEX index_name ... + + The REST service expects ``DEFAULT`` in those cases when none of the other three specifications are provided. + Any other value, or the absence of one, will be considered an error. + +``comment`` : *string* := ``""`` + The optional comment for the index. The value will be passed via the ``COMMENT`` parameter of the MySQL index + creation command: + + .. code-block:: sql + + CREATE ... INDEX ... COMMENT 'string' ... + +``columns`` : *array* + The required non-empty array of JSON objects describing the keys (``key_part``) of the index creation statement: + + .. code-block:: sql + + CREATE ... INDEX index_name ON tbl_name (key_part,...) ... + + .. note:: + + The current implementation of the service doesn't support the extended-expression syntax of key_part introduced + in MySQL version 8. + + These are the mandatory attributes of each key-specific object: + + ``column`` : *string* + The required name of a column. + ``length`` : *number* + The required length of a substring used for the index. It only has a meaning for columns of + types ``TEXT``, ``CHAR(N)``, ``VARCHAR(N)``, ``BLOB``, etc., and it must always be 0 for other column + types (numeric, etc.). Otherwise, an index creation request will fail. + + ``ascending`` : *number* + The required sorting order of the column in the index. It translates into the ``ASC`` or ``DESC`` option + in the key definition in ``key_part``. A value of ``0`` will be interpreted as ``DESC``. Any other positive number + will be interpreted as ``ASC``. + +``auth_key`` : *string* + The required authorization key. + +Here is an example of the index creation request. Let's suppose we have the *regular* (fully replicated) +table that has the following schema: + +..
code-block:: sql + + CREATE TABLE `sdss_stripe82_01`.`Science_Ccd_Exposure_NoFile` ( + `scienceCcdExposureId` BIGINT(20) NOT NULL, + `run` INT(11) NOT NULL, + `filterId` TINYINT(4) NOT NULL, + `camcol` TINYINT(4) NOT NULL, + `field` INT(11) NOT NULL, + `path` VARCHAR(255) NOT NULL + ); + +And, suppose we are going to create the ``PRIMARY`` key index based on the very first column ``scienceCcdExposureId``. +In this case the request object will look like this: + +.. code-block:: json + + { "database" : "sdss_stripe82_01", + "table" : "Science_Ccd_Exposure_NoFile", + "index" : "PRIMARY", + "spec" : "UNIQUE", + "comment" : "This is the primary key index", + "columns" : [ + { "column" : "scienceCcdExposureId", + "length" : 0, + "ascending" : 1 + } + ], + "auth_key" : "" + } + +The request deliberately misses the optional ``overlap`` attribute since it won't apply to the regular tables. + +Here is how the request could be submitted to the service using ``curl``: + +.. code-block:: bash + + curl 'http://localhost:25081/replication/sql/index' \ + -X POST -H "Content-Type: application/json" \ + -d '{"database":"sdss_stripe82_01","table":"Science_Ccd_Exposure_NoFile", + "index":"PRIMARY","spec":"UNIQUE","comment":"This is the primary key index", + "columns":[{"column":"scienceCcdExposureId","length":0,"ascending":1}], + "auth_key":""}' + +.. _admin-data-table-index-delete: + +Deleting +-------- + +To delete an existing index, a user must submit a request to the service: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``DELETE`` + - ``/replication/sql/index`` + +Where the request object has the following schema: + +.. code-block:: + + { "database" : , + "table" : , + "overlap" : , + "index" : , + "auth_key" : + } + +Where: + +``database`` : *string* + The required name of the database where the table resides. + +``table`` : *string* + The required *base* name of the table where the index will be created. + +``overlap`` : *number* := ``0`` + The optional *overlap* flagg indicating a sub-type of the *chunk* table. The value should be one of the following: + + - ``1`` : *full overlap* + - ``0`` : *chunk* + +``index`` : *string* + The required name of the index to be dropped. + +Here is an example of the index deletion request. It's based on the same table that was mentioned in the previous section. +The request object will look like this: + +.. code-block:: json + + { "database" : "sdss_stripe82_01", + "table" : "Science_Ccd_Exposure_NoFile", + "index" : "PRIMARY", + "auth_key" : "" + } + +Here is how the request could be submitted to the service using ``curl``: + +.. code-block:: bash + + curl 'http://localhost:25081/replication/sql/index' \ + -X DELETE -H "Content-Type: application/json" \ + -d '{"database":"sdss_stripe82_01","table":"Science_Ccd_Exposure_NoFile", + "index":"PRIMARY", + "auth_key":""}' + +.. _admin-data-table-index-inspect: + +Inspecting +---------- + +To inspect the existing indexes, a user must submit a request to the service: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``GET`` + - ``/replication/sql/index/:database/:table[?overlap={0|1}]`` + +Where the service path has the following parameters: + +``database`` : *string* + The name of a database affected by the operation. + +``table`` : *string* + The name of the table for which the indexes are required to be collected. + +The optional query parameyter is + +``overlap`` : *number* := ``0`` + The optional *overlap* flagg indicating a sub-type of the *chunk* table. 
The value should be one of the following: + + - ``1`` : *full overlap* + - ``0`` : *chunk* + +In case of successful completion of the request the JSON object returned by the service will have the following schema: + +.. code-block:: + + { "status": { + "database" : , + "table" : , + "overlap" : , + "indexes" : [ + { "name" : , + "unique" : , + "type" : , + "comment" : , + "status" : , + "num_replicas_total" : , + "num_replicas" : , + "columns" : [ + { "name" : , + "seq" : , + "sub_part" : , + "collation" : + }, + ... + ] + }, + ... + ] + } + } + +Where: + +``name`` : *string* + The name of the index (the key). + +``unique`` : *number* + The numeric flag indicates of the index's keys are unique, where a value of ``0`` means they're unique. Any other + value would mean the opposite. + +``type`` : *string* + The type of index, such as ``BTREE``, ``SPATIAL``, ``PRIMARY``, ``FULLTEXT``, etc. + +``comment`` : *string* + An optional explanation for the index passed to the index creation statement: + + .. code-block:: sql + + CREATE ... INDEX ... COMMENT 'string' ... + +``status`` : *string* + The status of the index. This parameter considers the aggregate status of the index across all replicas of the table. + Possible values here are: + + - ``COMPLETE`` : the same index (same type, columns) is present in all replicas of the table (or its chunks in the case + of the partitioned table) + - ``INCOMPLETE`` : the same index is present in a subset of replicas of the table, where all indexes have the same + definition. + - ``INCONSISTENT`` : instances of the index that have the same name have different definitions in some replicas + + .. warning:: + + The result object reported by the service will not provide any further details on the last status ``INCONSISTENT`` + apart from indicating the inconsistency. It will be up to the data administrator to investigate which replicas have + unexpected index definitions. + +``num_replicas_total`` : *number* + The total number of replicas that exist for the table. This is the target number of replicas where the index is expected + to be present. + +``num_replicas`` : *number* + The number of replicas where the index was found to be present. If this number is not the same as the one reported + in the attribute + ``num_replicas_total`` then the index will be ``INCOMPLETE``. + +``columns`` : *array* + The collection of columns that were included in the index definition. Each entry of the collection has: + + ``name`` : *string* + The name of the column + ``seq`` : *number* + The 1-based position of the column in the index. + ``sub_part`` : *number* + The index prefix. That is, the number of indexed characters if the column is only partly indexed 0 if + the entire column is indexed. + ``collation`` : *string* + How the column is sorted in the index. This can have values ``ASC``, ``DESC``, or ``NOT_SORTED``. + +The following request will report indexes from the fully-replicated table ``ivoa.ObsCore``: + +.. code-block:: bash + + curl http://localhost:25081/replication/sql/index/ivoa/ObsCore -X GET + +The (truncated and formatted for readability) result of an operation performed in a Qserv deployment with 6-workers may +look like this: + +.. 
code-block:: json + + { "status" : { + "database" : "ivoa" + "indexes" : [ + { "name" : "idx_dataproduct_subtype", + "type" : "BTREE", + "unique" : 0, + "status" : "COMPLETE", + "num_replicas" : 6, + "num_replicas_total" : 6, + "comment" : "The regular index on the column dataproduct_subtype", + "columns" : [ + { "collation" : "ASC", + "name" : "dataproduct_subtype", + "seq" : 1, + "sub_part" : 0 + } + ] + }, + { "name" : "idx_s_region_bounds", + "type" : "SPATIAL", + "unique" : 0, + "status" : "COMPLETE", + "num_replicas" : 6, + "num_replicas_total" : 6, + "comment" : "The spatial index on the geometric region s_region_bounds", + "columns" : [ + { "collation" : "ASC", + "name" : "s_region_bounds", + "seq" : 1, + "sub_part" : 32 + } + ] + }, + { "name" : "idx_lsst_tract_patch", + "type" : "BTREE", + "unique" : 0, + "status" : "COMPLETE", + "num_replicas" : 6, + "num_replicas_total" : 6, + "comment" : "The composite index on the columns lsst_tract and lsst_patch", + "columns" : [ + { "collation" : "ASC", + "name" : "lsst_tract", + "seq" : 1, + "sub_part" : 0 + }, + { "collation" : "ASC", + "name" : "lsst_patch", + "seq" : 2, + "sub_part" : 0 + } + ] + }, + } + +.. note:: + + - The second index ``idx_s_region_bounds`` is spatial. It's based on the binary column of which only + the first 32 bytes are indexed. + + - The third index ``idx_lsst_tract_patch`` is defined over two columns. diff --git a/doc/admin/director-index.rst b/doc/admin/director-index.rst new file mode 100644 index 0000000000..cc8c9a21a5 --- /dev/null +++ b/doc/admin/director-index.rst @@ -0,0 +1,108 @@ +.. _admin-director-index: + +Director Index +============== + +The *director* indexes in Qserv are optional metadata tables associated with the *director* tables, which are explained in: + +- :ref:`ingest-api-concepts-table-types` (CONCEPTS) + +Each row in the index table refers to the corresponding row in the related *director* table. The association is done via +the unique identifier of rows in the *director* table. In additon to the unique identifier, the index table also contains +the number of a chunk (column ``chunkId``) which contains the row in the *director* table. The index table is used to speed up the queries that +use the primary keys of *director* tables to reference rows. + +Here is an example of the index table schema and the schema of the corresponding *director* table ``Object``: + +.. code-block:: sql + + CREATE TABLE qservMeta.test101__Object ( + objectId BIGINT NOT NULL, + chunkId INT NOT NULL, + subChunkId INT NOT NULL, + PRIMARY KEY (objectId) + ); + + CREATE TABLE test101.Object ( + objectId BIGINT NOT NULL, + .. + ); + +The index allows to speed up the following types of queries: + +- point queries (when an identifier is known) +- ``JOIN`` queries (when the *director* table is used as a reference table by the dependent tables) + +Point queries can be executed without scanning all chunk tables of the *director* table. Once the chunk number is known, +the query will be sent to the corresponding chunk table at a worker node where the table resides. For example, +the following query can be several orders of magnitude faster with the index: + +.. code-block:: sql + + SELECT * FROM test101.Object WHERE objectId = 123456; + +The index is optional. If the index table is not found in the Qserv Czar's database, queries will be executed +by scanning all chunk tables of the *director* table. 
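+For illustration, below is a sketch of the second type of query, a ``JOIN`` that can take advantage
+of the index. The dependent table ``Source`` and its columns are hypothetical here; any dependent
+table referencing ``Object`` through ``objectId`` would follow the same pattern:
+
+.. code-block:: sql
+
+    -- With the director index, Czar resolves objectId = 123456 to a single chunk and
+    -- dispatches the join only to the worker(s) hosting that chunk, instead of scanning
+    -- all chunk tables of the director table.
+    SELECT o.objectId, s.sourceId
+    FROM test101.Object o
+    JOIN test101.Source s ON s.objectId = o.objectId
+    WHERE o.objectId = 123456;
+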
+ +The index table can be built in two ways: + +- Automatically by the Qserv Replication/Ingest system during transaction commit time if the corresponding flag + was set as ``auto_build_secondary_index=1`` when calling the database registration service: + + - :ref:`ingest-db-table-management-register-db` (REST) + + .. note:: + + The index tables that are built automatically will be MySQL-partitioned. The partitioning is done + to speed up the index construction process and to benefit from using the distributed transactions + mechanism implemented in the Qserv Ingest system: + + - :ref:`ingest-api-concepts-transactions` (CONCEPTS) + + Having too many partitions in the index table can slow down user queries that use the index. Another side + effect of the partitions is an increased size of the table. The partitions can be consolidated at the database + *publishing* stage as described in the following section: + + - :ref:`ingest-api-concepts-publishing-data` (CONCEPTS) + +- Manually, on the *published* databases using the following service: + + - :ref:`ingest-director-index-build` (REST) + + Note that the index tables built by this service will not be partitioned. + +The following example illustrates rebuilding the index of the *director* table ``Object`` that resides in +the *published* database ``test101``: + +.. code-block:: bash + + curl localhost:25081/ingest/index/secondary \ + -X POST -H "Content-Type: application/json" \ + -d '{"database":"test101", "director_table":"Object","rebuild":1,"local":1}' + +.. warning:: + + The index rebuilding process can be time-consuming and potentially affect the performance of user query processing + in Qserv. Depending on the size of the *director* table, the process can take from several minutes to several hours. + For *director* tables exceeding 1 billion rows, the process can be particularly lengthy. + It's recommended to perform the index rebuilding during a maintenance window or when the system load is low. + +Notes on the MySQL table engine configuration for the index +----------------------------------------------------------- + +The current implementation of the Replication/Ingest system offers the following options for the implementation +of index table: + +- ``innodb``: https://mariadb.com/kb/en/innodb/ +- ``myisam``: https://mariadb.com/kb/en/myisam-storage-engine/ + +Each engine has its own pros and cons. + +The ``innodb`` engine is the default choice. The option is controlled by the following configuration parameter of the Master +Replication Controller: + +- ``(controller,director-index-engine)`` + +The parameter can be set via the command line when starting the controller: + +- ``--controller-director-index-engine=`` diff --git a/doc/admin/index.rst b/doc/admin/index.rst index d9f6e3f44b..24bb3322b6 100644 --- a/doc/admin/index.rst +++ b/doc/admin/index.rst @@ -1,15 +1,14 @@ -.. warning:: - **Information in this guide is known to be outdated.** A documentation sprint is underway which will - include updates and revisions to this guide. +.. _admin: -#################### -Administration Guide -#################### +##################### +Administrator's Guide +##################### .. toctree:: :maxdepth: 4 k8s - qserv-ingest/index - test-set + row-counters + data-table-indexes + director-index diff --git a/doc/admin/qserv-ingest/index.rst b/doc/admin/qserv-ingest/index.rst deleted file mode 100644 index 38cddd282e..0000000000 --- a/doc/admin/qserv-ingest/index.rst +++ /dev/null @@ -1,15 +0,0 @@ -.. 
_installation-label: - -######################### -The Qserv Ingest Workflow -######################### - -.. toctree:: - :maxdepth: 2 - - input-data - version - run - repcli - itest - diff --git a/doc/admin/row-counters.rst b/doc/admin/row-counters.rst new file mode 100644 index 0000000000..04d9c975e7 --- /dev/null +++ b/doc/admin/row-counters.rst @@ -0,0 +1,176 @@ + +.. _admin-row-counters: + +========================= +Row counters optimization +========================= + +.. _admin-row-counters-intro: + +Introduction +------------ + +Soon after the initial public deployment of Qserv, it was noticed that numerous users were executing the following query: + +.. code-block:: sql + + SELECT COUNT(*) FROM <database>.<table>
+ +Typically, Qserv handles this query by distributing it to all workers, which then count the rows in each chunk table and aggregate the results +at the Czar. This process is akin to the one used for *shared scan* (or simply *scan*) queries. The performance of these *scan* queries can +fluctuate based on several factors: + +- The number of chunks in the target table +- The number of workers available +- The presence of other concurrent queries (particularly slower ones) + +In the best-case scenario, such a scan would take seconds; in the worst case, it could take many minutes or even hours. +This has led to frustration among users, as this query appears to be (and indeed is) a very trivial non-scan query. + +To address this situation, Qserv includes a built-in optimization specifically for this type of query. +Here's how it works: Qserv Czar maintains an optional metadata table for each data table, which stores the row count for each +chunk. This metadata table is populated and managed by the Qserv Replication system. If the table is found, the query +optimizer will use it to determine the number of rows in the table without the need to scan all the chunks. + +Note that this optimization is currently optional for the following reasons: + +- Collecting counters requires scanning all chunk tables, which can be time-consuming. Performing this during + the catalog *publishing* phase would extend the ingest time and increase the likelihood of workflow instabilities + (generally, the longer an operation takes, the higher the probability of encountering infrastructure-related failures). +- The counters are not necessary for the data ingest process itself. They are merely optimizations for query performance. +- Building the counters before the ingested data have been quality assured (Q&A-ed) may not be advisable. +- The counters may need to be rebuilt if the data are modified (e.g., after making corrections to the ingested catalogs). + +The following sections provide detailed instructions on building, managing, and utilizing the row counters, along with formal +descriptions of the corresponding REST services. + +.. note:: + + In the future, the per-chunk counters will be used for optimizing another class of unconditional queries + presented below: + + .. code-block:: sql + + SELECT * FROM .
<table> LIMIT <num> + SELECT `col`,`col2` FROM <database>.<table>
LIMIT + + For these "indiscriminate" data probes, Qserv would dispatch chunk queries to a subset of random chunks that have enough + rows to satisfy the requirements specified in ``LIMIT ``. + +.. _admin-row-counters-build: + +Building and deploying +---------------------- + +.. warning:: + + Depending on the scale of a catalog (data size of the affected table), it may take a while before this operation + will be complete. + +.. note:: + + Please, be advised that the very same operation could be performed at the catalog publishing time as explained in: + + - :ref:`ingest-db-table-management-publish-db` (REST) + + The decision to perform this operation during catalog publishing or as a separate step, as described in this document, + is left to the discretion of Qserv administrators or developers of the ingest workflows. It is generally recommended + to make it a separate stage in the ingest workflow. This approach can expedite the overall transition time of a catalog + to its final published state. Ultimately, row counters optimization is optional and does not impact the core functionality + of Qserv or the query results presented to users. + +To build and deploy the counters, use the following REST service: + +- :ref:`ingest-row-counters-deploy` (REST) + +The service needs to be invoked for every table in the ingested catalog. Here is a typical example of using this service, +which will work even if the same operation was performed previously: + +.. code-block:: bash + + curl http://localhost:25080/ingest/table-stats \ + -X POST -H "Content-Type: application/json" \ + -d '{"database":"test101", + "table":"Object", + "overlap_selector":"CHUNK_AND_OVERLAP", + "force_rescan":1, + "row_counters_state_update_policy":"ENABLED", + "row_counters_deploy_at_qserv":1, + "auth_key":""}' + +This method is applicable to all table types: *director*, *dependent*, *ref-match*, or *regular* (fully replicated). +If the counters already exist in the Replication system's database, they will be rescanned and redeployed. + +It is advisable to compare Qserv's performance for executing the aforementioned queries before and after running this operation. +Typically, if the table statistics are available in Qserv, the result should be returned in a small fraction of +a second (approximately 10 milliseconds) on a lightly loaded Qserv. + +.. _admin-row-counters-delete: + +Deleting +-------- + +In certain situations, such as when there is suspicion that the row counters were inaccurately scanned or during the quality +assurance (Q&A) process of the ingested catalog, a data administrator might need to remove the counters and allow Qserv +to perform a full table scan. This can be achieved using the following REST service: + +- :ref:`ingest-row-counters-delete` (REST) + +Similarly to the previously mentioned service, this one should also be invoked for each table requiring attention. Here is +an example: + +.. code-block:: bash + + curl http://localhost:25080/ingest/table-stats/test101/Object \ + -X DELETE -H "Content-Type: application/json" \ + -d '{"overlap_selector":"CHUNK_AND_OVERLAP","qserv_only":1,"auth_key":""}' + +Note that with the parameters shown above, the statistics will be removed from Qserv only. +This means the system would not need to rescan the tables again if the statistics need to be rebuilt. The counters could simply +be redeployed later at Qserv. To remove the counters from the Replication system's persistent state as well, +the request should have ``qserv_only=0``. 
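+For completeness, here is the same request with ``qserv_only=0`` (using the same example database
+and table as above), which removes the counters from both Qserv and the Replication system's
+persistent state:
+
+.. code-block:: bash
+
+    curl http://localhost:25080/ingest/table-stats/test101/Object \
+         -X DELETE -H "Content-Type: application/json" \
+         -d '{"overlap_selector":"CHUNK_AND_OVERLAP","qserv_only":0,"auth_key":""}'
+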
+ +An alternative approach, detailed in the next section, is to instruct Qserv to bypass the counters for query optimization. + + +.. _admin-row-counters-disable: + +Disabling the optimization at run-time +--------------------------------------- + +.. warning:: + + This is a global setting that affects all users of Qserv. All new queries will be run without the optimization. + It should be used with caution. Typically, it is intended for use by the Qserv data administrator to investigate + suspected issues with Qserv or the catalogs it serves. + +To complement the previously explained methods for scanning, deploying, or deleting row counters for query optimization, +Qserv also supports a run-time switch. This switch can be turned on or off by submitting the following statements via +the Qserv front-ends: + +.. code-block:: sql + + SET GLOBAL QSERV_ROW_COUNTER_OPTIMIZATION = 1 + SET GLOBAL QSERV_ROW_COUNTER_OPTIMIZATION = 0 + +The default behavior of Qserv, when the variable is not set, is to enable the optimization for tables where the counters +are available. + +.. _admin-row-counters-retrieve: + +Inspecting +---------- + +It's also possible to retrieve the counters from the Replication system's state using the following REST service: + +.. code-block:: bash + + curl http://localhost:25080/ingest/table-stats/test101/Object \ + -X GET -H "Content-Type: application/json" \ + -d '{"auth_key":""}' + +- :ref:`ingest-row-counters-inspect` (REST) + +The retrieved information can be utilized for multiple purposes, including investigating potential issues with the counters, +monitoring data distribution across chunks, or creating visual representations of chunk density maps. Refer to the REST service +documentation for more details on this topic. diff --git a/doc/admin/test-set.rst b/doc/admin/test-set.rst deleted file mode 100644 index 8d8a31136a..0000000000 --- a/doc/admin/test-set.rst +++ /dev/null @@ -1,60 +0,0 @@ -********* -Test sets -********* - -.. warning:: - DEPRECATED - -Integration tests -================= - -Once a Qserv mono-node instance is running, you can run advanced integration test on one dataset by using: - -.. code-block:: bash - - qserv-check-integration.py --load --case=01 - # See advanced options with --help option - -You can also run the whole integration test suite, with fixed parameters by using : - -.. code-block:: bash - - qserv-test-integration.py - # See advanced options with --help option - -Results are stored in ``$QSERV_RUN_DIR/tmp/qservTest_case/outputs/``, and are erased before each run. - -Input data sets ---------------- - -Directory structure -^^^^^^^^^^^^^^^^^^^ - -:: - case/ - README.txt - contains info about data - queries/ - data/ -
.schema - contains schema info per table -
.csv.gz - contains data`` - -Database -^^^^^^^^ - -data from case will be loaded into databases called - - ``qservTest_case_mysql``, for mysql - - and ``LSST``, for qserv - -Query file format -^^^^^^^^^^^^^^^^^ - -Query file are named ``_.sql`` where ```` means : - - ``0xxx`` supported, trivial (single object) - - ``1xxx`` supported, simple (small area) - - ``2xxx`` supported, medium difficulty (full scan) - - ``3xxx`` supported, difficult /expensive (e.g. full sky joins) - - ``4xxx`` supported, very difficult (eg near neighbor for large area) - - ``8xxx`` queries with bad syntax. They can fail, but should not crash the server - - ``9xxx`` unknown support - -Files that are not yet supported should have extension ``.FIXME``. diff --git a/doc/conf.py b/doc/conf.py index 13dcb17f0b..aded147ad6 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -42,6 +42,7 @@ r"^https://rubinobs.atlassian.net/wiki/", r"^https://rubinobs.atlassian.net/browse/", r"^https://www.slac.stanford.edu/", + r".*/_images/", ] html_additional_pages = { diff --git a/doc/dev/css.rst b/doc/dev/css.rst deleted file mode 100644 index c8bf1a19c0..0000000000 --- a/doc/dev/css.rst +++ /dev/null @@ -1,68 +0,0 @@ -################### -CSS metadata layout -################### - -The Central State System (CSS) maintains information about the data stored in -Qserv. Currently, this consists of the data schema: the names of databases and -what tables they contain, the partitioning parameters used by the database, etc. - -************** -Implementation -************** - -CSS content is stored in a hierarchical key-value structure, with keys -structured much like filename paths. In standard Qserv installations, CSS data -is kept in a MySQL database, although parts of qserv may be tested with CSS -content stored in a more feature-light form, e.g., a file. - -*********** -Packed keys -*********** - -In order to reduce the number of key-value updates when manipulating CSS -content, some portions of the content tree are stored packed in JSON format. -This is signified by presence of ".packed.json" key. The content of this key is -a structure (JSON object) which contains some of the keys appearing at the same -level as ".packed.json" key. For example, suppose CSS content includes -(specified in python dict syntax): - -.. code-block:: python - - { "/foo/bar" : "12345", - "/foo/bar/baz" : "regular data", - "/foo/bar/.packed.json" : '{"a":"aa", "b":"bb"}' } - -The key "/foo/bar/.packed.json" is unpacked, thus the above content will be -interpreted as: - -.. code-block:: python - - { "/foo/bar" : "12345", - "/foo/bar/baz" : "regular data", - "/foo/bar/a" : "aa", - "/foo/bar/b" : "bb" } - -Note that the presence of "/foo/bar/.packed.json" does not prevent the presence -of "/foo/bar/baz". It is not specified what happens if the same key appears in -both regular CSS structure and in packed key value like in this structure: - -.. code-block:: python - - # it is unspecified which value /foo/bar/a has when unpacked - { "/foo/bar" : "12345", - "/foo/bar/a" : "regular key data", - "/foo/bar/.packed.json" : '{"a":"aa", "b":"bb"}' } - -The values in JSON object can be simple types like strings or numbers, complex -objects are not supported there. All values in JSON object are transformed to -strings when unpacked, so the packed object ``"{"key": 1}"`` is treated -identically to ``"{"key": "1"}"``. - -************* -CSS interface -************* - -CSS key-value storage is hidden completely behind CSS interface -(``css/CssAccess`` class). 
This interface defines all logical operations on -databases, tables, nodes, etc. and translates those operations into key-value -structure manipulatons, including packing and upacking of the keys. diff --git a/doc/dev/index.rst b/doc/dev/index.rst index 1b5134fefd..0fc6b0cc50 100644 --- a/doc/dev/index.rst +++ b/doc/dev/index.rst @@ -1,18 +1,12 @@ -.. warning:: - - **Information in this guide is known to be outdated.** A documentation sprint is underway which will - include updates and revisions to this guide. - .. highlight:: sql -############### -Developer Guide -############### +################# +Developer's Guide +################# .. toctree:: :maxdepth: 2 quick-start-devel doc - css - wmgr-api + scisql diff --git a/doc/dev/quick-start-devel.rst b/doc/dev/quick-start-devel.rst index 4b0cd7b345..1bfdaabbe6 100644 --- a/doc/dev/quick-start-devel.rst +++ b/doc/dev/quick-start-devel.rst @@ -1,3 +1,7 @@ +.. warning:: + + Information in this guide is known to be outdated. The updated version will be available in the near future. + .. _quick-start-devel: ################################ diff --git a/doc/dev/scisql.rst b/doc/dev/scisql.rst new file mode 100644 index 0000000000..d533bd2c43 --- /dev/null +++ b/doc/dev/scisql.rst @@ -0,0 +1,113 @@ +.. note:: + The oficial documentation for the ``sciSQL`` project can be found at https://smonkewitz.github.io/scisql/ + +====================== +sciSQL Developer Notes +====================== + +The document presents the development methodology for ``sciSQL`` in the context of the Qserv container environment. + +Making a build container +------------------------ + +The most straight-forward way to do this involves starting with a generic MariaDB distribution container, +then layering in the development toolchain. Build and test then take place within this customized container. +Here is an example of a small ``Dockerfile`` for this: + +.. code-block:: dockerfile + + FROM mariadb:10.6 + RUN apt-get update \ + && apt-get install -y g++ git make libmariadb-dev python3 python3-pip vim \ + && pip3 install future mako mysqlclient \ + && update-alternatives --install /usr/bin/python python /usr/bin/python3 0 + ENV MARIADB_ROOT_PASSWORD=CHANGEME + VOLUME /root + +With that file the desired image is built by: + +.. code-block:: bash + + docker build -t scisql-dev - < Dockerfile + +The ``ENV`` and ``VOLUME`` lines of the ``Dockerfile`` file are for convenience when running the resulting container. +The stock MariaDB container already has a ``VOLUME`` for ``/var/lib/sql``. Passing this and the additional ``VOLUME`` +for ``/root`` conveniently captures your development state in case the container crashes or you otherwise wish to restart it. + +Running the container +--------------------- + +To run the container built as above: + +.. code-block:: bash + + docker run --name scisql-dev --init -d \ + -v scisql-dev-data:/var/lib/mysql \ + -v scisql-dev-home:/root scisql-dev + +You need to provide names for volumes holding MariaDB state and the root user home directory. This exact same command can be +repeated to re-launch with preserved state (providing, e.g., you check out and build under ``/root`` within the container). + +Now the container will start. If it is a first run on a given data volume, it will take some tens of seconds for MariaDB +to initialize. You can monitor docker logs on the container. When it is ready to go you will see "ready for connections" +near the end of the log. 
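+For example, assuming the container name ``scisql-dev`` used above, the log can be followed until
+that message appears with:
+
+.. code-block:: bash
+
+    docker logs -f scisql-dev
+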
There will be a running MariaDB server within this container, into which scisql can be installed +and tested. + +At this point, it's recommended using a tool like ``VSCode``'s ``"connect to running container"`` to attach +the IDE to the container. It can take ``VSCode`` a little while to download and install its server-side support within +the container (another nice reason to have this persisted in the ``/root`` volume). You may wish to install a few niceties +like your favorite ``.profile``, ssh keys for GitHub, etc. now in ``/root``. + +Building and testing sciSQL +--------------------------- + +Now, inside the container, clone out from ``github.com/smonkewitz/scisql``. Configure and build with: + +.. code-block:: bash + + git clone https://github.com/smonkewitz/scisql.git + cd scisql + ./configure --mysql-includes=/usr/include/mariadb + make + +From here, the somewhat non-obvious iterated incantation to rebuild, deploy into the local MariaDB, and run the test +suite is: + +.. code-block:: bash + + make install && echo $MARIADB_ROOT_PASSWORD | PYTHONPATH=/usr/local/python \ + scisql-deploy.py --mysql-dir=/usr \ + --mysql-socket=/run/mysqld/mysqld.sock \ + --mysql-plugin-dir=/lib/mysql/plugin + +If you don't want to undeploy/redeploy the UDFs and plugin, but are just iterating on the unit tests themselves, +the following shortcut version is also useful: + +.. code-block:: bash + + make install && echo $MARIADB_ROOT_PASSWORD | PYTHONPATH=/usr/local/python \ + scisql-deploy.py --mysql-dir=/usr \ + --mysql-socket=/run/mysqld/mysqld.sock \ + --mysql-plugin-dir=/lib/mysql/plugin \ + --test + +Updating the HTML documentation +------------------------------- + +The HTML documentation is rendered by the Mako template library for Python: https://www.makotemplates.org from comments +embedded in the sources and templates in ``tools/templates/``. Incantation to build is just: + +.. code-block:: bash + + make html_docs + +If you are using ``VSCode``, you can get a tunneled live view on the documentation in your working tree by popping +an additional terminal, and incanting: + +.. code-block:: bash + + cd doc + exec python -m http.server + +Don't forget to add and commit the re-rendered documentation (under ``doc/``) on your PR. After a PR is merged to master, +the documentation will automatically update on github pages at https://smonkewitz.github.io/scisql/ diff --git a/doc/dev/wmgr-api.rst b/doc/dev/wmgr-api.rst deleted file mode 100644 index 27e6cbab62..0000000000 --- a/doc/dev/wmgr-api.rst +++ /dev/null @@ -1,1092 +0,0 @@ -Worker Manager API -################## - -Overview -******** - -Worker manager is a service which runs alongside every qserv worker or czar and -manages many common operations: - -* database operations (table creating, data loading, etc.) -* certain xrootd (plug-in) operations -* management of other services' lifetime - -Primary motivation for implementing wmgr service was to facilitate data loading -in integration tests. It is likely that some of the wmgr functions may be -changed in the future once we implement production-level services for data -loading and distribution. For more details on wmgr design consult pages: - -* https://jira.lsstcorp.org/browse/DM-1900 -* https://dev.lsstcorp.org/trac/wiki/db/Qserv/WMGRDesign - -This document describes wmgr HTTP-basedAPI in details. Note that there is also a -Python client library implemented on top of HTTP API. 
- - -General API description -*********************** - -Wmgr API is implemented on top of HTTP protocol following RESTful principles -(where possible). All communication with wmgr is performed over HTTP without -encryption (implementing SSL/TLS is feasible but needs solution for certificate -management). - -Wmgr can be configured to require one of few authentication mechanisms - none, -basic, or digest. Lack of SSL/TLS dictates use digest authentication for -production services. - -Whole wmgr API is split into three groups - database, xrootd, and process API. -Below is description of individual API methods (actions). - -Return codes and content types -============================== - -Responses generated by wmgr service use regular HTTP status codes to indicate -success or error. Codes in range 200-299 are used for success, 400-499 for -errors (range 500-599 typically means service failure). - -Normally responses generated by wmgr service include data in JSON format and -have their ``Content-Type`` header set to ``application/json``. This applies to -both successful completion and error conditions. In some cases when error -condition is returned by the transport/framework layer it will have different -content type. - -See also document which defines general structure of JSON objects for returned -response data: -https://confluence.lsstcorp.org/display/DM/REST+API+General+Guidelines - - -Database API -************ - -This section contains description of actions related to database operations - -creating/deleting databases and tables, loading table data, etc. - - -``GET /dbs`` -============ - - Return the list of all existing database names. - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 200 - for successful return - * 500 - for database errors - - Response body (for successful completion): - JSON object, "results" property is a list of database objects with keys: - * name: database name - * uri: URL for database operations - - Request/response example:: - - GET /dbs HTTP/1.0 - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "results": [ - { - "name": "qservMeta", - "uri": "/dbs/qservMeta" - }, - { - "name": "qservResult", - "uri": "/dbs/qservResult" - } - ] - } - -``POST /dbs`` -============= - - Create new database. In addition to creating database itself this method - also grants all privileges on this database to regular non-privileged - account. - - Request headers: - * Content-Type: required as ``multipart/form-data`` - - Form Parameters: - * db: Database name - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 201 - if database was successfully created - * 400 - if database name is missing - * 409 - if database already exists - * 500 - for other database errors - - Response body (for successful completion): - JSON object, "result" property is a database object with keys: - * name: database name - * uri: URL for database operations - - Request/response example:: - - POST /dbs HTTP/1.0 - Content-Type: multipart/form-data; boundary=------------------------bb306714c15713c2 - - --------------------------bb306714c15713c2 - Content-Disposition: form-data; name="db" - - newDB - --------------------------bb306714c15713c2-- - - :: - - HTTP/1.0 201 CREATED - Content-Type: application/json - - { - "result": { - "name": "newDB", - "uri": "/dbs/newDB" - } - } - -``DELETE /dbs/`` -======================== - - Deletes database. 
- - Parameters: - * dbName: database name - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 200 - if database was successfully deleted - * 400 - if parameters have invalid format - * 404 - if database does not exist - * 500 - for other database errors - - Response body (for successful completion): - JSON object, "result" property is a database object with keys: - * name: database name - * uri: URL for database operations - - Request/response example:: - - DELETE /dbs/newDB HTTP/1.0 - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "result": { - "name": "newDB", - "uri": "/dbs/newDB" - } - } - -``GET /dbs//tables`` -============================ - - Returns the list of tables in a database. - - Parameters: - * dbName: database name - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 200 - for successful return - * 400 - if parameters have invalid format - * 404 - if database does not exist - * 500 - for database errors - - Response body (for successful completion): - JSON object, "results" property is a list of table objects with keys: - * name: table name - * uri: URL for database operations - - Request/response example:: - - GET /dbs/qservMeta/tables HTTP/1.0 - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "results": [ - { - "name": "QCzar", - "uri": "/dbs/qservMeta/tables/QCzar" - }, - { - "name": "QInfo", - "uri": "/dbs/qservMeta/tables/QInfo" - }, - ... - ] - } - -``POST /dbs//tables`` -============================= - - Create new table. - - If ``schemaSource`` (see below) is "request" then request must include - ``schema`` parameter which is an SQL DDL statement starting with 'CREATE - TABLE TableName ...'. - - If ``schemaSource`` is "css" then ``table`` parameter must be specified. - Table schema will be extracted from CSS in this case, ``schemaSource`` must - not be given. - - Parameters: - * dbName: database name - - Request headers: - * Content-Type: required as ``multipart/form-data`` - - Form Parameters: - * table: Table name - * schemaSource: source for schema name, possible - values: "request", "css", (default: "request") - * schema: complete "CREATE TABLE ..." statement - (optional) - * chunkColumns: boolean flag, false by default, - accepted values: '0', '1', 'yes', 'no', 'false', 'true'. 
If set - to true then delete columns "_chunkId", "_subChunkId" from table - (if they exist) and add columns "chunkId", "subChunkId" (if they - don't exist) - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 201 - if table was successfully created - * 400 - if parameters have invalid format or if form - parameters are missing or conflicting - * 409 - if table already exists - * 500 - if table is not defined in CSS or other - database errors - - Response body (for successful completion): - JSON object, "result" property is a table object with keys: - * name: database name - * uri: URL for database operations - - Request/response example:: - - POST /dbs/newDB/tables HTTP/1.0 - Content-Type: multipart/form-data; boundary=------------------------c5c44964f0f9add0 - - --------------------------c5c44964f0f9add0 - Content-Disposition: form-data; name="schema" - - CREATE TABLE newTable (I INT) - --------------------------c5c44964f0f9add0 - Content-Disposition: form-data; name="table" - - newTable - --------------------------c5c44964f0f9add0-- - - :: - - HTTP/1.0 201 CREATED - Content-Type: application/json - - { - "result": { - "name": "newTable", - "uri": "/dbs/newDB/tables/newTable" - } - } - -``DELETE /dbs//tables/`` -========================================= - - Drop a table and optionally all chunk/overlap tables. - - Parameters: - * dbName: database name - * tblName: table name - - Query Parameters: - * dropChunks: boolean flag, false by default, accepted - values: '0', '1', 'yes', 'no', 'false', 'true' - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 200 - if table was successfully deleted - * 400 - if parameters have invalid format - * 404 - if table does not exist - * 500 - for other database errors - - Response body (for successful completion): - JSON object, "result" property is a table object with keys: - * name: database name - * uri: URL for database operations - - Request/response example:: - - DELETE /dbs/newDB/tables/newTable HTTP/1.0 - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "result": { - "name": "newTable", - "uri": "/dbs/newDB/tables/newTable" - } - } - - -``GET /dbs//tables//schema`` -============================================= - - Return result of SHOW CREATE TABLE statement for given table. - - Parameters: - * dbName: database name - * tblName: table name - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 200 - for successful return - * 400 - if parameters have invalid format - * 404 - if table does not exist - * 500 - for database errors - - Response body (for successful completion): - JSON object, "result" property is a string with resulting schema. - * name: table name - * uri: URL for database operations - - Request/response example:: - - GET /dbs/newDB/tables/newTable/schema HTTP/1.0 - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "result": "CREATE TABLE `newTable` (\n `I` int(11) DEFAULT NULL\n) ENGINE=MyISAM DEFAULT CHARSET=latin1" - } - -``GET /dbs//tables//columns`` -============================================== - - Return result of SHOW COLUMNS statement for given table. 
- - Parameters: - * dbName: database name - * tblName: table name - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 200 - for successful return - * 400 - if parameters have invalid format - * 404 - if table does not exist - * 500 - for database errors - - Response body (for successful completion): - JSON object, "results" property is a list of column - objects with keys: name, type, key, default, null - - Request/response example:: - - GET /dbs/newDB/tables/newTable/columns HTTP/1.0 - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "results": [ - { - "default": null, - "key": "", - "name": "I", - "null": "YES", - "type": "int(11)" - } - ] - } - -``GET /dbs//tables//chunks`` -============================================= - - Return the list of chunks in a table. For non-chunked table empty list - is returned. - - Parameters: - * dbName: database name - * tblName: table name - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 200 - for successful return - * 400 - if parameters have invalid format - * 404 - if table does not exist - * 500 - for database errors - - Response body (for successful completion): - JSON object, "results" property is a list of chunk objects with keys: - * chunkId: chunk number (integer) - * chunkTable: true if chunk has regular chunk - table (boolean) - * overlapTable: true if chunk has overlap - table (boolean) - * uri: URL for chunk operations - - Request/response example:: - - GET /dbs/qservTest_case01_qserv/tables/Object/chunks HTTP/1.0 - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "results": [ - { - "chunkId": 7648, - "chunkTable": true, - "overlapTable": true, - "uri": "/dbs/qservTest_case01_qserv/tables/Object/chunks/7648" - }, - ... - ] - } - -``POST /dbs//tables//chunks`` -============================================== - - Create new chunk. 
- - Parameters: - * dbName: database name - * tblName: table name - - Request headers: - * Content-Type: required as ``multipart/form-data`` - - Form Parameters: - * chunkId: chunk ID, non-negative integer - * overlapFlag: if true then create overlap table too - (default is true), accepted values: '0', '1', 'yes', 'no', - 'false', 'true' - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 201 - if chunk tables were successfully created - * 400 - if parameters have invalid format or if form - parameters are missing or conflicting - * 404 - if table is missing - * 409 - if chunk table already exists - * 500 - if table is not defined in CSS or other - database errors - - Response body (for successful completion): - JSON object, "result" property is a chunk object with keys: - * chunkId: chunk number (integer) - * chunkTable: true if chunk has regular chunk - table (boolean) - * overlapTable: true if chunk has overlap - table (boolean) - * uri: URL for chunk operations - - Request/response example:: - - POST /dbs/newDB/tables/newTable/chunks HTTP/1.0 - Content-Type: multipart/form-data; boundary=------------------------df029da2ec8387ce - - --------------------------df029da2ec8387ce - Content-Disposition: form-data; name="chunkId" - - 1000 - --------------------------df029da2ec8387ce-- - - :: - - HTTP/1.0 201 CREATED - Content-Type: application/json - - { - "result": { - "chunkId": 1000, - "chunkTable": true, - "overlapTable": true, - "uri": "/dbs/newDB/tables/newTable/chunks/1000" - } - } - -``DELETE /dbs//tables//chunks/`` -========================================================== - - Delete chunk from a table, both chunk data and overlap data is dropped. - - Parameters: - * dbName: database name - * tblName: table name - * chunkId: chunk number, non-negative integer - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 200 - if table was successfully deleted - * 400 - if parameters have invalid format - * 404 - if table does not exist - * 500 - for other database errors - - Response body (for successful completion): - JSON object, "result" property is a chunk object with keys: - * chunkId: chunk number (integer) - * chunkTable: true if chunk has regular chunk - table (boolean) - * overlapTable: true if chunk has overlap - table (boolean) - * uri: URL for chunk operations - - Request/response example:: - - DELETE /dbs/newDB/tables/newTable/chunks/1000 HTTP/1.0 - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "result": { - "chunkId": 1000, - "chunkTable": true, - "overlapTable": true, - "uri": "/dbs/newDB/tables/newTable/chunks/1000" - } - } - -``POST //tables//data`` -======================================== - - Upload data into a table using file format supported by mysql command - LOAD DATA [LOCAL] INFILE. - - Parameters: - * dbName: database name - * tblName: table name - - Request headers: - * Content-Type: required as ``multipart/form-data`` - - Form Parameters: - * table-data: the data come in original LOAD DATA - format with ``binary/octet-stream`` content type and binary - encoding, and it may be compressed with gzip. 
- * load-options: set of options encoded with usual - ``application/x-www-form-urlencoded`` content type, options are: - - delimiter - defaults to TAB - - enclose - defaults to empty string (strings are not enclosed) - - escape - defaults to backslash - - terminate - defaults to newline - - compressed - "0" or "1", by default is guessed from file extension (.gz) - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 201 - if chunk tables were successfully created - * 400 - if parameters have invalid format or if form - parameters are missing or conflicting - * 404 - if table is missing - * 409 - if chunk table already exists - * 500 - if table is not defined in CSS or other - database errors - - Response body (for successful completion): - JSON object, "result" property is an object with keys: - * status: string "OK" - * count: count of rows added to a table - - Request/response example:: - - POST /dbs/newDB/tables/newTable/data HTTP/1.0 - Content-Type: multipart/form-data; boundary=------------------------345ad77805210ac6 - - --------------------------345ad77805210ac6 - Content-Disposition: form-data; name="table-data"; filename="table.dat.gz" - Content-Type: application/octet-stream - - .....<.U..table.dat.3.2400.2.bS..;....... - - --------------------------345ad77805210ac6 - Content-Disposition: form-data; name="load-options" - - compressed=1&delimiter=%2C - --------------------------345ad77805210ac6-- - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "result": { - "count": 4, - "status": "OK" - } - } - -``POST //tables//chunks//data`` -========================================================= - - Upload data into a chunk table using file format supported by mysql - command LOAD DATA [LOCAL] INFILE. - - This method works exactly as previous one taking the same form parameter - but it loads data into a chunk and has additional URL parameter specifying - chunk number. - -``POST //tables//chunks//overlap`` -============================================================ - - Upload data into overlap table using file format supported by mysql - command LOAD DATA [LOCAL] INFILE. - - This method works exactly as previous one taking the same form parameter - but it loads data into an overlap table and has additional URL parameter - specifying chunk number. - -``GET /dbs//tables//index`` -============================================ - - Return index data (array of (objectId, chunkId, subChunkId) triplets). - - Parameters: - * dbName: database name - * tblName: table name - * chunkId: chunk number (non-negative integer) - - Query Parameters: - * columns: specifies comma-separated list of three - column names. Default column names are "objectId", "chunkId", - "subChunkId". Result returns columns in the same order as they - are specified in 'columns' argument. 
- - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 200 - for successful return - * 400 - if parameters have invalid format - * 404 - if table does not exist - * 500 - for other database errors - - Response body (for successful completion): - JSON object, "result" property is an object with keys: - * description: array of three objects - describing columns, each with keys "name" (column name) and - "type" (MySQL type name) - * rows: array of arrays of integers - - Request/response example:: - - GET /dbs/qservTest_case01_qserv/tables/Object/index HTTP/1.1 - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "result": { - "description": [ - { - "name": "objectId", - "type": "LONGLONG" - }, - { - "name": "chunkId", - "type": "LONG" - }, - { - "name": "subChunkId", - "type": "LONG" - } - ], - "rows": [ - [ - 386937898687249, - 6630, - 897 - ], - [ - 386942193651348, - 6630, - 660 - ], - ... - ] - } - } - -``GET /dbs//tables//chunks//index`` -============================================================= - - Return index data (array of (objectId, chunkId, subChunkId) triplets) - for single chunk. - - Does the same as previous method but for one chunk from partitioned - table. Useful when index for whole table may be too big. - - Request/response example:: - - GET /dbs/qservTest_case01_qserv/tables/Object/chunks/7648/index HTTP/1.0 - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "result": { - "description": [ - { - "name": "objectId", - "type": "LONGLONG" - }, - { - "name": "chunkId", - "type": "LONG" - }, - { - "name": "subChunkId", - "type": "LONG" - } - ], - "rows": [ - [ - 433306365599363, - 7648, - 5 - ], - [ - 433314955527561, - 7648, - 10 - ], - ... - ] - } - } - - -Xrootd API -********** - -This section contains description of actions related to xrootd operations - e.g. -publishing database via xrootd. - - -``GET /xrootd/dbs`` -=================== - - Return the list of databases known to xrootd. - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 200 - for success - * 500 - for other database errors - - Response body (for successful completion): - JSON object, "results" property is a list of database objects with keys: - * name: database name - * uri: URL for *xrootd* database operations - - Request/response example:: - - GET /xrootd/dbs HTTP/1.0 - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "results": [ - { - "name": "qservTest_case01_qserv", - "uri": "/xrootd/dbs/qservTest_case01_qserv" - } - ] - } - -``POST /xrootd/dbs`` -==================== - - Register new database in xrootd chunk inventory. 
- - Request headers: - * Content-Type: required as ``multipart/form-data`` - - Form Parameters: - * db: database name (required) - * xrootdRestart: if set to 'no' then do not restart - xrootd (defaults to yes) - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 201 - if database was successfully registered - * 400 - if parameters are missing or have invalid - format - * 409 - if database is already registered - * 500 - on other database errors - - Response body (for successful completion): - JSON object, "results" property is a database object with keys: - * name: database name - * uri: URL for *xrootd* database operations - - Request/response example:: - - POST /xrootd/dbs HTTP/1.0 - Content-Type: multipart/form-data; boundary=------------------------370e6e4d60b7499e - - --------------------------370e6e4d60b7499e - Content-Disposition: form-data; name="db" - - newDB - --------------------------370e6e4d60b7499e - Content-Disposition: form-data; name="xrootdRestart" - - no - --------------------------370e6e4d60b7499e-- - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "result": { - "name": "newDB", - "uri": "/xrootd/dbs/newDB" - } - } - -``DELETE /xrootd/dbs/`` -=============================== - - Unregister database from xrootd chunk inventory. - - Parameters: - * dbName: database name - - Query Parameters: - * xrootdRestart: if set to 'no' then do not restart - xrootd (defaults to yes) - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 200 - for success - * 400 - if parameters have invalid format - * 409 - if database is not registered - * 500 - for other database errors - - Response body (for successful completion): - JSON object, "results" property is a database object with keys: - * name: database name - * uri: URL for *xrootd* database operations - - Request/response example:: - - DELETE /xrootd/dbs/newDB?xrootdRestart=no HTTP/1.0 - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "result": { - "name": "newDB", - "uri": "/xrootd/dbs/newDB" - } - } - -``GET /xrootd/dbs/`` -============================ - - Return the list of chunk IDs in a database as known to xrootd. - - .. note:: Not implemented yet. - - -Services API -************ - -This section contains description of actions related to operations on services - -e.g. stopping and starting processes. - - -``GET /services`` -================= - - Return the list of services. - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 200 - for success - - Response body (for successful completion): - JSON object, "results" property is a list of service objects with keys: - * name: service name - * uri: URL for service operations - - Request/response example:: - - GET /services HTTP/1.0 - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "results": [ - { - "name": "xrootd", - "uri": "/services/xrootd" - }, - { - "name": "mysqld", - "uri": "/services/mysqld" - } - ] - } - -``GET /services/`` -=========================== - - Return service state. 
- - Parameters: - * service: service name - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 200 - for success - * 404 - for invalid service name - - Response body (for successful completion): - JSON object, "result" property is a service object with keys: - * name: service name - * state: one of "active" or "stopped" - * uri: URL for service operations - - Request/response example:: - - GET /services/mysqld HTTP/1.0 - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "result": { - "name": "mysqld", - "state": "active", - "uri": "/services/mysqld" - } - } - -``PUT /services/`` -=========================== - - Execute some action on service, like "stop" or "restart". - - Parameters: - * service: service name - - Request headers: - * Content-Type: required as ``multipart/form-data`` - - Form Parameters: - * action: action: one of 'stop', 'start', 'restart' - (required) - - Response headers: - * Content-Type: ``application/json`` - - Status Codes: - * 200 - for success - * 400 - if parameters are missing or have invalid - format - * 409 - if action has failed - - Response body (for successful completion): - JSON object, "result" property is a service object with keys: - * name: service name - * state: state of service after action, one of - "active" or "stopped" - * uri: URL for service operations - - Request/response example:: - - PUT /services/mysqld HTTP/1.0 - Content-Type: multipart/form-data; boundary=------------------------48169e483bc7d12e - - --------------------------48169e483bc7d12e - Content-Disposition: form-data; name="action" - - restart - --------------------------48169e483bc7d12e-- - - :: - - HTTP/1.0 200 OK - Content-Type: application/json - - { - "result": { - "name": "mysqld", - "state": "active", - "uri": "/services/mysqld" - } - } diff --git a/doc/documenteer.toml b/doc/documenteer.toml index 7ae39c6f41..811b923f11 100644 --- a/doc/documenteer.toml +++ b/doc/documenteer.toml @@ -1,5 +1,5 @@ [project] title = "Qserv" -copyright = "2015-2023 Association of Universities for Research in Astronomy, Inc. (AURA)" +copyright = "2015-2024 Association of Universities for Research in Astronomy, Inc. (AURA)" base_url = "https://qserv.lsst.io" github_url = "https://github.com/lsst/qserv" diff --git a/doc/index.rst b/doc/index.rst index e0922133e7..aec51063d6 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -3,4 +3,5 @@ /intro/index /admin/index /user/index + /ingest/index /dev/index diff --git a/doc/ingest/api/advanced/async-concurrency.rst b/doc/ingest/api/advanced/async-concurrency.rst new file mode 100644 index 0000000000..ec9722e101 --- /dev/null +++ b/doc/ingest/api/advanced/async-concurrency.rst @@ -0,0 +1,75 @@ +.. _ingest-api-advanced-concurrency: + +Concurrency control when processing async requests +-------------------------------------------------- + +.. note:: + + This optional mechanism is designed specifically for handling contribution requests submitted asynchronously by reference. + Workflows using the synchronous interface should implement their own request load balancing strategies. + +The current implementation of the worker request processor for *asynchronously* submitted contribution requests +(:ref:`ingest-worker-contrib-by-ref`) follows a straightforward model: + +- Incoming requests are managed by a single input queue. +- Requests are queued based on their creation timestamp. +- At server startup, a fixed-size pool of processing threads is initialized. 
+- The pool size is configured by the worker's parameter ``(worker,num-async-loader-processing-threads)``, which can be set + at startup using the command line option ``--worker-num-async-loader-processing-threads=``. By default, this value + is twice the number of hardware threads on the worker host. +- Worker threads process requests sequentially from the queue's head. + +This model can cause issues in some deployments where resource availability is limited, such as: + +- Remote data sources (e.g., HTTP servers, object stores) may become overloaded if the total number of parallel requests from all + workers exceeds the service capacity. For instance, disk I/O performance may degrade when services read too many files simultaneously. +- The performance of locally mounted distributed filesystems at workers may degrade if there are too many simultaneous file + reads, especially when input data is located on such filesystems. +- Ongoing ingest activities can significantly degrade Qserv performance for user queries due to resource contention (memory, disk I/O, network I/O, CPU). +- The timing of ingests can be problematic. For instance, massive ingests might be scheduled at night, while less intensive + activities occur during the day when user activity is higher. + +Adjusting the number of processing threads in the service configuration is not an optimal solution because it requires restarting +all worker servers (or the entire Qserv in Kubernetes-based deployments) whenever the ingest workflow needs to manage resource usage. +Additionally, the constraints can vary based on the specific context in both "space" (ingesting particular databases from specific sources) +and "time" (when Qserv is under heavy load from user queries). + +To mitigate these issues, the API provides a feature to control the concurrency level of processed requests. Limits can be configured +at the database level. Workflows can query or set these limits using the existing REST services, as detailed in the following section: + +- :ref:`ingest-config` (REST) + +Here is an example of how to configure all workers to limit concurrency to a maximum of 4 requests per worker for +the database ``test101``: + +.. code-block:: bash + + curl http://localhost:25081/ingest/config \ + -X PUT -H 'Content-Type: application/json' \ + -d'{"database":"test101","ASYNC_PROC_LIMIT":4,"auth_key":""}' + +Specifying a value of ``0`` will remove the concurrency limit, causing the system to revert to using the default number of processing threads. + +Workflows can modify the limit at any time, and changes will take effect immediately. However, the new limit will only +apply to requests that are pulled from the queue after the change. Existing requests in progress will not be interrupted, +even if the limit is reduced. + +The following example demonstrates how to use the related service to retrieve the current concurrency limit for a specific database: + +.. code-block:: bash + + curl 'http://localhost:25081/ingest/config?database=test101' -X GET + +This would return: + +.. code-block:: json + + { "config": { + "ASYNC_PROC_LIMIT": 4, + "database": "test101" + }, + "error": "", + "error_ext": {}, + "success": 1, + "warning": "No version number was provided in the request's query." + } diff --git a/doc/ingest/api/advanced/charset.rst b/doc/ingest/api/advanced/charset.rst new file mode 100644 index 0000000000..f0dacccef7 --- /dev/null +++ b/doc/ingest/api/advanced/charset.rst @@ -0,0 +1,93 @@ +.. 
_ingest-api-advanced-charset: + +Character sets in contributions +------------------------------- + +.. note:: + + This feature was added in the REST API version ``15``. + +Background +^^^^^^^^^^ + +The current implementation of the Qserv Replication/Ingest system relies on the following SQL statement for ingesting +table data: + +.. code-block:: sql + + LOAD DATA [LOCAL] INFILE ... + +According to the MySQL/MariaDB documentation (https://mariadb.com/kb/en/load-data-infile/#character-sets), the database +server may interpret or transform the input data (CSV) differently depending on the character set assumed by the operation. +Incorrect data transformation can result in distorted data being stored in the target table. This issue is most likely +to occur when the input data were produced by another MySQL server using the following SQL statement: + +.. code-block:: sql + + SELECT ... INTO OUTFILE ... + +The right approach to address this situation is twofold: + +- Know (or configure) a specific character set when producing the data files (CSV). +- Use the same character set when loading these files into Qserv. + +Before version ``15`` of the API, the Qserv Ingest System did not provide any control over the latter. The system relied on +the default character set in the database service configuration. This could lead to the following problems: + +- The character set configured in this way may be arbitrary, as it may be set by a database administrator who might not + be aware of the problem (or of the origin and parameters of the input data to be loaded into Qserv). Besides, the default + character set may change between MySQL/MariaDB versions, or it could depend on the choice of the OS (Debian, CentOS, etc.) + for the base Docker image. + +- Different input data may be produced with different character sets. In this case, setting one globally at the database + level (even if this could be done deterministically) wouldn't work for all inputs. + +As of version ``15``: + +- The implementation is reinforced to assume a specific default character set for the data loading operation. The default + is presently set to ``latin1``. +- The ingest API is extended to allow overriding the default character set when loading contributions into Qserv. + +Global configuration +^^^^^^^^^^^^^^^^^^^^ + +.. warning:: + + Setting the server-side configuration parameters of the Ingest system is not supposed to be under the direct control of + the workflow developers. Managing Qserv deployments and the configuration of the Ingest system is the responsibility of + the Qserv administrators. Therefore, the workflow developers are advised to set the name of the desired character set in + each contribution request as explained in the subsections below. + +The configuration of the workers now includes the following parameter ``(worker,ingest-charset-name)``. The parameter can +be set at the server startup via the command line option ``--worker-ingest-charset-name=``. The default value is +``latin1``. + +In the REST API +^^^^^^^^^^^^^^^ + +Overriding the default character set value is supported by all forms of contribution ingest services, whether contributions +are submitted *by reference* or *by value*, and whether they are *synchronous* or *asynchronous* requests. + +The desired character set value is specified via the ``charset_name`` parameter. This parameter should be a string representing +the name of the character set. It is optional; if not provided, the Ingest system will use its default character set value. 
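+ +For illustration, a contribution request submitted *by reference* could carry the parameter in its JSON body as sketched below. This is a minimal, hypothetical example: the host, port, endpoint, and all attributes other than ``charset_name`` are illustrative placeholders, and the actual request format is defined by the worker contribution services referenced above: + +.. code-block:: bash + + # A sketch only: the URL and the other request attributes are illustrative placeholders, + # not a definitive specification of the worker contribution service. + curl http://qserv-worker-db01:25004/ingest/file \ + -X POST -H 'Content-Type: application/json' \ + -d'{"transaction_id":123,"table":"Object","chunk":187107,"overlap":0,"url":"http://datastore.example.com/chunk_187107.csv","charset_name":"latin1","auth_key":""}'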
+ +All services that return the status of contribution requests will include the character set name used by the Ingest system when +processing the contributions. This information is reported in the JSON response object as: + +.. code-block:: + + { "charset_name": <string> + } + + +qserv-replica-file +^^^^^^^^^^^^^^^^^^ + +The command line tool :ref:`ingest-tools-qserv-replica-file` allows ingesting contributions via the proprietary binary protocol +and has an option ``--charset-name=``: + +.. code-block:: bash + + qserv-replica-file INGEST {FILE|FILE-LIST|FILE-LIST-TRANS} \ + ... \ + --charset-name= diff --git a/doc/ingest/api/advanced/config.rst b/doc/ingest/api/advanced/config.rst new file mode 100644 index 0000000000..dbccf73099 --- /dev/null +++ b/doc/ingest/api/advanced/config.rst @@ -0,0 +1,105 @@ +.. _ingest-api-advanced-global-config: + +Global configuration options +---------------------------- + +This section presents the configuration parameters of Qserv and the Replication/Ingest system which may +affect the ingest activities. This section provides instructions for finding values of the parameters +via the API, explains the roles of the parameters, and outlines possible usage of the parameters by the workflow. + +The number of workers +^^^^^^^^^^^^^^^^^^^^^ + +The total number of Qserv workers is the main factor contributing to the performance of the catalog ingest operations. +The number can be obtained using the following REST service: + +- :ref:`ingest-config-global-workers` (REST) + +The workflow needs to analyze the section ``config.workers`` to select workers in the following state (both apply): + +- ``is-enabled=1`` +- ``is-read-only=0`` + +There are a few possibilities for how the workflow could use this information. For example, the workflow +could start a separate transaction (or a set of transactions) per worker. + +Worker-specific parameters +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The second group of parameters, found in the section ``config.general.worker``, is related to resources which +are available to the individual worker ingest services for processing contributions. These are instructions +for some of the most relevant parameters: + +- ``num-loader-processing-threads``: + + The parameter affects the flow of ingest requests made via the proprietary binary protocol using the command-line + tool :ref:`ingest-tools-qserv-replica-file`. To achieve the maximum throughput of the ingest, the workflows + should aim at having each participating worker loaded with as many parallel requests as there are threads + reported by this parameter. + + .. warning:: + + Exceeding the number of threads will result in clients having to wait for connections to be established + with the workers. In some cases this may lead to performance degradation if the network connection + is unstable. + +- ``num-http-loader-processing-threads``: + + The parameter affects the flow of ingest requests submitted via the HTTP-based ingest service. Note that + this service is used for processing *synchronous* contribution requests and for submitting the *asynchronous* + requests to the service. + + The workflow may use the value of the parameter differently, depending on the type of the contribution request. + Requests which are *synchronous* should be submitted to the service in a way that keeps the number of such requests + per worker close to the number of threads reported by this parameter. In this case the workflow should + expect the maximum throughput of the ingest. 
The *asynchronous* requests aren't affected by the parameter, + unless another workflow is using the service in the *synchronous* mode at the same time. + +- ``num-async-loader-processing-threads``: + + The parameter represents the number of request processing threads in a thread pool allocated for + the *asynchronous* contribution requests. The workflow should aim at having the number of unfinished + *asynchronous* requests submitted to the service close to (or exceeding) the number of threads reported + by this parameter. To do so, the workflow should monitor the number and the status of the *asynchronous* + requests at each worker and submit new requests to the service when the number of requests being processed + is less than the number of threads. The relevant services for monitoring the contribution requests at workers + are documented in: + + - :ref:`ingest-worker-contrib-get` (WORKER) + + .. note:: + + An alternative approach is to submit all *asynchronous* requests to the relevant worker services at once. + The services will take care of processing the requests in the same order they were submitted. This approach + may not work well where a specific order of the requests is important, or if all input data is not available + at the time of the submission. + +- ``ingest-charset-name``: + + The name of a character set for parsing the payload of the contributions. The workflow may override the default + value of the parameter if the payload of the contributions is encoded in a different character set. See the + attribute ``charset_name`` in: + + - :ref:`ingest-worker-contrib-by-ref` (WORKER) + - :ref:`ingest-worker-contrib-by-val` (WORKER) + +- ``ingest-num-retries``, ``ingest-max-retries``: + + These parameters are related to the number of automatic retries of failed *asynchronous* requests + specified in the parameter ``num_retries`` of the contribution request. The workflow may adjust the number + of such retries if needed. A good example is when the workflow knows that a connection to the data source + (a Web server or the object store) is unstable, or that the server might be overloaded. The workflow may increase + the number of retries to ensure that the data is ingested successfully. + + .. note:: + + The parameter ``ingest-max-retries`` is a hard limit on the number of retries regardless of what's + specified in the request's attribute ``num_retries``. + +- ``loader-max-warnings``: + + This parameter sets the default limit on the number of warnings that the worker ingest service can + capture from MySQL after attempting to ingest a contribution. The workflow may adjust the parameter + for individual contributions by setting the desired limit in the request's attribute ``max_warnings``. + The main purpose of setting the limit higher than the default value is to debug problems with the + data of the contributions. diff --git a/doc/ingest/api/advanced/contributions.rst b/doc/ingest/api/advanced/contributions.rst new file mode 100644 index 0000000000..21fad852f2 --- /dev/null +++ b/doc/ingest/api/advanced/contributions.rst @@ -0,0 +1,51 @@ +.. _ingest-api-advanced-contributions: + +Options for making contribution requests +---------------------------------------- + +The API provides a variety of options for making contribution requests. The choice of the most suitable method depends on +the specific needs of the client application. 
The following list outlines the available ingest modes: + +- **pull**: Tell the ingest service to pull the data from the Web server: + + - :ref:`ingest-worker-contrib-by-ref` (WORKER) + +- **read**: Tell the ingest service to read the data directly from the locally mounted filesystem that is accessible + to the worker: + + - :ref:`ingest-worker-contrib-by-ref` (WORKER) + +- **push**: Send the data over the proprietary binary protocol or ``http`` protocol to the ingest service: + + - :ref:`ingest-tools-qserv-replica-file` (TOOLS) + - :ref:`ingest-worker-contrib-by-val` (WORKER) + + The workflow can either read the data from the local filesystem or access the data directly from memory. + +All methods support the ``CSV`` data format. Additionally, the **push** mode over the ``http`` protocol also supports +the ``JSON`` format. For more details, refer to the documentation of the respective service. + +The **pull** and **read** modes also support both *synchronous* and *asynchronous* data ingestion approaches. + +The following diagrams illustrate the three modes of making contribution requests: + +Pulling data from the Web server +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. image:: /_static/ingest-options-pull.png + :target: ../../../_images/ingest-options-pull.png + :alt: Pull Mode + +Reading data from the local filesystem of the worker +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. image:: /_static/ingest-options-read.png + :target: ../../../_images/ingest-options-read.png + :alt: Read Mode + +Pushing data from the workflow to the service +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. image:: /_static/ingest-options-push.png + :target: ../../../_images/ingest-options-push.png + :alt: Read Mode diff --git a/doc/ingest/api/advanced/directors.rst b/doc/ingest/api/advanced/directors.rst new file mode 100644 index 0000000000..0e0efc82bc --- /dev/null +++ b/doc/ingest/api/advanced/directors.rst @@ -0,0 +1,162 @@ + +.. _ingest-api-advanced-directors: + +Databases with many director tables +=================================== + +.. tip:: + + See the following document first: + + - :ref:`ingest-api-concepts-table-types` (CONCEPTS) + +The API supports ingesting multiple *director* tables within a single catalog. Each *director* table can optionally have its +own set of *dependent* tables. This section demonstrates the necessary configuration for the following table set: + +.. table:: + + +----------+-----------+ + | director | dependent | + +==========+===========+ + | dir_1 | dep_1_1 | + | +-----------+ + | | dep_1_2 | + | +-----------+ + | | dep_1_3 | + +----------+-----------+ + | dir_2 | dep_2_1 | + | +-----------+ + | | dep_2_2 | + +----------+-----------+ + | dir_3 | | + +----------+-----------+ + +In this example, there are 3 *director* tables. Each (except the third one ``dir_3``) has its own set of *dependent* tables. +The key attributes that govern dependencies between the tables are specified in the section :ref:`ingest-db-table-management-register-table` +(REST). The document mentions the following required attributes that need to be provided in the JSON specification of +each table: + +.. 
table:: + + +--------------------+----------+-----------------------------------------------------------------------------------------------------+ + | attribute | value | comment | + +====================+==========+=====================================================================================================+ + | ``is_partitioned`` | ``1`` | Same value for both *director* and *dependent* tables. | + +--------------------+----------+-----------------------------------------------------------------------------------------------------+ + | ``director_table`` | *string* | Where: | + | | | | + | | | - *director* tables should have the empty string here. | + | | | - *dependent* tables should have the name of the corresponding *director* table here. | + +--------------------+----------+-----------------------------------------------------------------------------------------------------+ + | ``director_key`` | *string* | The non-empty string specifying the name of the corresponding column must be | + | | | provided here. Depending on the type of the table, this name corresponds to either: | + | | | | + | | | - The *primary key* in the *director* table, or | + | | | - The *foreign key* pointing to the corresponding director's primary key in the *dependent* tables. | + +--------------------+----------+-----------------------------------------------------------------------------------------------------+ + +The *director* tables are **required** to have the following attributes: + +.. table:: + + +-------------------+----------+---------------------------------------------------------------------------+ + | attribute | value | comment | + +===================+==========+===========================================================================+ + | ``latitude_key`` | *string* | The names of the director table's columns that were used for partitioning | + | ``longitude_key`` | | the table data into chunks. | + +-------------------+----------+---------------------------------------------------------------------------+ + +The *dependent* tables may also include the attributes ``latitude_key`` and ``longitude_key`` if their input data was partitioned using the columns specified +by these attributes. If not, these attributes can be omitted from the table's JSON specification. + +The following table illustrates how JSON configurations for all the above-mentioned tables might look like +(the examples were simplified for clarity): + +.. table:: + + +-----------------------------------+-------------------------------------------+ + | director | dependents | + +===================================+===========================================+ + | .. code-block:: json | .. code-block:: json | + | | | + | { "table" : "dir_1", | { "table" : "dep_1_1", | + | "is_partitioned" : 1, | "is_partitioned" : 1, | + | "director_table" : "", | "director_table" : "dir_1", | + | "director_key" : "objectId", | "director_key" : "dep_objectId" | + | "latitude_key" : "ra", | } | + | "longitude_key" : "dec" | | + | } | **Note**: Attributes ``latitude_key`` and | + | | ``longitude_key`` were not provided. | + | | is allowed for the dependent tables. | + | | | + | | .. code-block:: json | + | | | + | | { "table" : "dep_1_2", | + | | "is_partitioned" : 1, | + | | "director_table" : "dir_1", | + | | "director_key" : "dep_objectId" | + | | "latitude_key" : "", | + | | "longitude_key" : "" | + | | } | + | | | + | | **Note**: Attributes ``latitude_key`` and | + | | ``longitude_key`` were provided. 
However | + | | the values were empty strings, which is | + | | allowed for the dependent tables. | + | | | + | | .. code-block:: json | + | | | + | | { "table" : "dep_1_3", | + | | "is_partitioned" : 1, | + | | "director_table" : "dir_1", | + | | "director_key" : "dep_objectId" | + | | "latitude_key" : "dep_ra", | + | | "longitude_key" : "dep_dec" | + | | } | + +-----------------------------------+-------------------------------------------+ + | .. code-block:: json | .. code-block:: json | + | | | + | { "table" : "dir_2", | { "table" : "dep_2_1", | + | "is_partitioned" : 1, | "is_partitioned" : 1, | + | "director_table" : "", | "director_table" : "dir_2", | + | "director_key" : "id", | "director_key" : "dep_id" | + | "latitude_key" : "coord_ra", | } | + | "longitude_key" : "coord_dec" | | + | } | .. code-block:: json | + | | | + | | { "table" : "dep_2_1", | + | | "is_partitioned" : 1, | + | | "director_table" : "dir_2", | + | | "director_key" : "dep_id" | + | | "latitude_key" : "dep_coord_ra", | + | | "longitude_key" : "dep_coord_dec" | + | | } | + +-----------------------------------+-------------------------------------------+ + | .. code-block:: json | No dependents for the *director* table | + | | | + | { "table" : "dir_3", | | + | "is_partitioned" : 1, | | + | "director_table" : "", | | + | "director_key" : "objectId", | | + | "latitude_key" : "ra", | | + | "longitude_key" : "dec" | | + | } | | + +-----------------------------------+-------------------------------------------+ + +.. note:: + + The attributes ``chunk_id_key`` and ``sub_chunk_id_key`` were required in older versions of the API and may still + be present in JSON configurations. However, they are no longer needed for registering tables during ingest. + The role-to-column mapping for these attributes is now predefined in the Ingest system implementation. + The mapping is presented below: + + +----------------------+----------------+ + | role | column | + +======================+================+ + | ``chunk_id_key`` | ``chunkId`` | + +----------------------+----------------+ + | ``sub_chunk_id_key`` | ``subChunkId`` | + +----------------------+----------------+ + + If any of these attributes are found in a configuration, their definitions will be ignored. + diff --git a/doc/ingest/api/advanced/index.rst b/doc/ingest/api/advanced/index.rst new file mode 100644 index 0000000000..fca32c2e3d --- /dev/null +++ b/doc/ingest/api/advanced/index.rst @@ -0,0 +1,26 @@ + +.. _ingest-api-advanced: + +================== +Advanced Scenarios +================== + +.. hint:: + + Read the following document first: + + - :ref:`ingest-api-concepts` (CONCEPTS) + +.. toctree:: + :maxdepth: 4 + + config + charset + async-concurrency + unpublishing + transactions + optimisations + contributions + directors + ref-match + warnings diff --git a/doc/ingest/api/advanced/optimisations.rst b/doc/ingest/api/advanced/optimisations.rst new file mode 100644 index 0000000000..b803546e91 --- /dev/null +++ b/doc/ingest/api/advanced/optimisations.rst @@ -0,0 +1,59 @@ + +.. _ingest-api-advanced-optimisations: + +Optimizations in using the REST services +======================================== + +When designing a workflow, it is crucial to avoid overloading the REST services with repeated or inefficient requests. +Whenever possible, make certain requests once and reuse their results. This is particularly important for workflows +designed for parallel ingests, where the results of some requests can be shared among parallel +activities (processes, etc.) 
within the workflows. + +While this document does not cover all possible optimizations for interacting with the services, it is +the responsibility of the workflow developer to determine what can be cached or shared based on +the progressive state of the ingested catalog and the organization of the workflow +Below are some of the most useful techniques. + +.. _ingest-api-advanced-optimisations-batch: + +Batch mode for allocating chunks +-------------------------------- + +.. note:: + + This optimization is feasible when all chunk numbers are known upfront. + A common scenario is when the workflow is ingesting a large dataset that has already been + *partitioned* into chunks. In such cases, the chunk numbers are known before the ingestion begins. + +In the example of the :ref:`ingest-api-simple` presented earlier, chunk allocations were made on a per-chunk basis +(:ref:`table-location-chunks-one`). While this method works well for scenarios with a small number of chunks, it may +slow down the performance of workflows ingesting large numbers of chunks or making numerous requests to the chunk +allocation service. This is because chunk allocation operations can be expensive, especially in a Qserv setup with +many pre-deployed chunks. In such cases, chunk allocation requests may take a significant amount of time. +To address this issue, the system provides a service for allocating batches of chunks, as explained in: + +- :ref:`table-location-chunks-many` (REST) + +In the context of the earlier presented example of a simple workflow the chunk allocation request object would +look like this: + +.. code-block:: json + + { "transaction_id" : 123, + "chunks" : [ 187107, 187108, 187109, 187110 ] + } + +The result could be reported as: + +.. code-block:: json + + { "location":[ + { "chunk":187107, "worker":"db01", "host":"qserv-db01", "port":25002 }, + { "chunk":187108, "worker":"db02", "host":"qserv-db02", "port":25002 }, + { "chunk":187109, "worker":"db01", "host":"qserv-db01", "port":25002 }, + { "chunk":187110, "worker":"db02", "host":"qserv-db02", "port":25002 } + ] + } + +The request can be made once, and its results can be distributed among parallel activities within the workflow +to ingest the corresponding chunk contributions. diff --git a/doc/ingest/api/advanced/ref-match.rst b/doc/ingest/api/advanced/ref-match.rst new file mode 100644 index 0000000000..b5f5951e57 --- /dev/null +++ b/doc/ingest/api/advanced/ref-match.rst @@ -0,0 +1,94 @@ +.. _ingest-api-advanced-refmatch: + +Ingesting ref-match tables +========================== + +.. tip:: + + See the following document first: + + - :ref:`ingest-api-concepts-table-types` (CONCEPTS) + +.. note:: + + The input data for *ref-match* tables must be partitioned differently than other subtypes of *partitioned* tables. + Detailed instructions on this topic can be found in the section: + + - :ref:`ingest-data-partitioning-ref-match` (DATA) + +The *ref-match* tables are a specialized class of *partitioned* tables that depend on (match rows of) two *director* tables. +These referenced *director* tables can be located within the same catalog as the *ref-match* table or in any other catalog +served by the same Qserv instance. The only additional requirement in the latter case is that all databases must belong to +the same database *family* (partitioned using the same values for the parameters ``stripes``, ``sub-stripes``, and ``overlap``). +This requirement is enforced by the table registration service of the Replication/Ingest system. 
From the system's perspective, +these tables are not different from any other *partitioned* tables. The only changes made to the table registration interface +to specifically support *ref-match* tables are redefining (extending) the syntax of the attribute ``director_table`` and adding +four optional attributes allowed in the JSON configurations of the tables as presented below: + +``director_table`` : *string* + A table referenced here must be the **first** *director* table that must be registered in Qserv before the *ref-match* table. + The table registration service will refuse the operation if the *director* doesn't exist. The table name may also include + the name of a database where the table is located if this database differs from the one where the *ref-match* itself + will be placed. The syntax of the parameter's value: + + .. code-block:: + + [.] + + Note that the name cannot be empty, and the database (if specified) or table names should not be enclosed in quotes. + + If the database name is provided, the database should already be known to the Replication/Ingest system. + +``director_key`` : *string* + A non-empty string specifying the name of the primary key column of the referenced *director* table must be provided here. + This column should also be present in the table schema. + +``director_table2`` : *string* + This is the **second** *director* table referenced by the *ref-match* table. The values for this attribute must adhere to the same + requirements and restrictions as those specified for the ``director_table`` attribute. + +``director_key2`` : *string* + A non-empty string specifying the name of the primary key column of the **second** referenced *director* table must be provided here. + This column should also be present in the table schema. Note that the name should be different from the one specified in + the ``director_key`` attribute. + +``flag`` : *string* + The name of a column that stores flags created by the special partitioning tool ``sph-partition-matches``. This column should + also be present in the table schema. Usually, the column has the SQL type ``UNSIGNED INT``. + +``ang_sep`` : *double* + The maximum angular separation (within the spatial coordinate system) between the matched objects. The value of this parameter + must be strictly greater than ``0`` and must not exceed the *overlap* value of the database *family*. The table registration service + will enforce this requirement and refuse to register the table if the condition is violated. + +.. note:: + + Spatial coordinate columns ``latitude_key`` and ``longitude_key`` are ignored for this class of tables. + +Here is an example of the JSON configuration for a *ref-match* table: + +.. 
code-block:: json + + { "database" : "Catalog-A", + "table" : "RefMatch_A_Object_B_DeepSource", + "is_partitioned" : 1, + "director_table" : "Object", + "director_key" : "objectId", + "director_table2" : "Catalog-B.DeepSource", + "director_key2" : "deepSourceId", + "flag" : "flags", + "ang_sep" : 0.01667, + "schema": [ + {"name" : "objectId", "type" : "BIGINT UNSIGNED"}, + {"name" : "deepSourceId", "type" : "BIGINT UNSIGNED"}, + {"name" : "flag", "type" : "INT UNSIGNED"}, + {"name" : "chunkId", "type" : "INT UNSIGNED"}, + {"name" : "subChunkId", "type" : "INT UNSIGNED"} + ], + "auth_key" : "" + } + +The configuration parameters of the *ref-match* tables can also be seen in the responses of the following REST services: + +- :ref:`ingest-db-table-management-register-table` (REST) +- :ref:`ingest-db-table-management-config` (REST) diff --git a/doc/ingest/api/advanced/transactions.rst b/doc/ingest/api/advanced/transactions.rst new file mode 100644 index 0000000000..07291839c5 --- /dev/null +++ b/doc/ingest/api/advanced/transactions.rst @@ -0,0 +1,277 @@ + +.. _ingest-api-advanced-transactions: + +Transaction management +====================== + +This document presents several advanced recipes related to transaction management in the Ingest API. +Please read the following document first: + +- :ref:`ingest-api-concepts-transactions` (CONCEPTS) + +.. _ingest-api-advanced-transactions-multiple: + +Planning multiple transactions +------------------------------ + +To improve workflow stability, particularly during failures, the system supports distributed transactions. +This method is essential for ensuring stable ingests. Transactions were initially discussed in the section +:ref:`ingest-api-simple`. This section further explores the advantages of this method by detailing the planning +and management of parallel transactions. + +All rows are ingested into the data tables within the scope of transactions. Once a transaction is committed, +all relevant contributions remain in the destination tables. Conversely, if the transaction is aborted, +the rows are removed. The transaction *abort* operation (:ref:`ingest-trans-management-end`) won't revert all +modifications made to tables. It will only remove rows ingested within the corresponding transaction. +For instance, any tables created during transactions will stay in Qserv. Any chunk allocations made during +transactions will also stay. Leaving some tables empty after this operation won't confuse Qserv even if +the tables remain empty after publishing the database. + +When designing a workflow for a specific catalog or a general-purpose workflow, it is crucial to consider potential +failures during ingests. Estimating the likelihood of encountering issues can guide the decision-making process +for planning the number and size of transactions to be started by the workflow. Here are some general guidelines: + +- If the probability of failures is low, it is advisable to divide the input dataset into larger portions + and ingest each portion in a separate transaction. +- Conversely, if the probability of failures is high, using smaller transactions may be more appropriate. + +Another approach is to create a self-adjusting workflow that dynamically decides on transaction sizes based +on feedback from previous transactions. For instance, the workflow could begin with several small transactions +as probes and then progressively increase or decrease the number of contributions per transaction based on the results. 
+This technique has the potential to enhance workflow performance. + +Other factors influencing the transaction planning process include: + +- **Availability of input data**: Contributions to the catalogs may arrive incrementally over an extended period. +- **Temporary disk space limitations**: The space for storing intermediate products (partitioned CSV files) may be restricted. +- **Qserv configuration**: The number of worker nodes in the Qserv setup can impact the workflow design. + +What is a resonable number of transactions per catalog ingest? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When planning ingest activities, consider the following global limits: + +- The total number of transactions per Qserv instance is capped by an unsigned 32-bit number. + The transaction identifier ``0`` is reserved by the Ingest System, so the maximum number of + transactions is ``4294967295``. +- The total number of transactions per table is limited to ``8192`` due to the MySQL partition + limit. Practically, opening more than ``100`` transactions per database is not advisable because + of the overheads associated with MySQL partitioning. + +Another factor to consider is the implementation of transactions. The Ingest system directly maps transactions +to MySQL table partitions. Each partition is represented by two files in the filesystem of the worker where +the corresponding table resides (in the current implementation of Qserv, the data tables use the ``MyISAM`` storage engine): + +- ``#p.MYD``: The data file of the MySQL partition. +- ``#p.MYI``: The index file of the MySQL partition. + +In the extreme case, the number of files representing chunked tables would be roughly equal to the total number of +chunks multiplied by the number of transactions open per catalog. For example, if there are ``150,000`` chunks in +a catalog and ``10`` transactions are open during the catalog ingest, the total number of files spread across +all workers could be as many as ``3,000,000``. If the number of workers is ``30``, then there would be +approximately ``100,000`` files per worker's filesystem, all in a single folder. + +In reality, the situation may not be as severe because the chunks-to-transactions "matrix" would be rather sparse, +and the actual number of files per directory could be about ``10`` times smaller. Additionally, all MySQL table partitions will +be eliminated during the catalog *publishing* phase. After that, each table will be represented +with the usual three files: + +- ``.frm``: The table definition file. +- ``.MYD``: The data file. +- ``.MYI``: The index file. + +Nevertheless, it is advisable to avoid opening thousands of transactions per catalog ingest, even though the *hard* +limit for MySQL partitions per table might seem quite high at ``8192``. + +.. _ingest-api-advanced-transactions-parallel: + +Parallel transactions +--------------------- + +This section covers some parallel ingest scenarios that may increase the overall performance of a workflow. + +Ingesting chunks in parallel within a single transaction +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This is the simplest scenario that assumes the following organization of the workflow: + +#. **Sequential**: Start a common transaction before uploading the first chunk. +#. **Parallel**: For each chunk: + + #. **Sequential**: Allocate a chunk + #. **Sequential**: Ingest contributions into each chunk. + +#. 
**Sequential**: Commit the common transaction after all contributions are successfully uploaded. + +The following diagram illustrates the idea: + +.. image:: /_static/ingest-trans-multiple-one.png + :target: ../../../_images/ingest-trans-multiple-one.png + :alt: One Transaction + +Things to consider: + +- The chunk allocation operations are serialized in the current version of the system. This may introduce + indirect synchronization between parallel chunk-specific ingests. The total latency incurred by such synchronization + is the latency of allocating one chunk multiplied by the number of chunks. +- The proposed scheme may not be very efficient if the number of chunks is large (heuristically, many thousands) + while chunk contributions are small. In this case, the latency of the chunk allocation requests may become a significant + factor limiting the performance of the workflow. +- Any failure to ingest a contribution will result in aborting the entire transaction. This can significantly + impact the workflow's performance, especially if the amount of data to be ingested is large. +
+Best use: + +- When the number of chunks is small and the amount of data or the number of contributions to be ingested into each chunk + is large. In this case the negative effects of the chunk allocation latency are negligible. +
+Ingesting chunks in parallel within dedicated transactions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This is a more complex scenario that assumes the following organization of the workflow: + +- **Parallel**: For each chunk do the following: + + #. **Sequential**: Start a separate transaction dedicated to ingesting all contributions of the chunk. + #. **Sequential**: Allocate the chunk and ingest all contributions into the chunk. + #. **Sequential**: Commit the transaction after all contributions into the chunk are successfully uploaded. +
+The following diagram illustrates the idea: + +.. image:: /_static/ingest-trans-multiple-chunks.png + :target: ../../../_images/ingest-trans-multiple-chunks.png + :alt: Per-chunk Transaction + +Things to consider: + +- Although this scheme assigns each chunk to a dedicated transaction, it is not strictly necessary. + The Ingest system allows allocating the same chunk and ingesting contributions into that chunk from any (or multiple) transactions. + Just ensure that the same set of rows (the same set of contributions) is not ingested within more than one transaction. + This rule applies to any workflow regardless. +- Failures in one chunk transaction will not affect chunk contributions made in the scope of other transactions. + This is a significant advantage of this scheme compared to the previous one. +
+Best use: + +- When ingesting a large dataset, it can be divided into independently ingested groups based on chunks. + Transactions offer a mechanism to handle failures effectively. + + +Scattered ingest of chunk contributions within multiple transactions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Workflow organization: + +- **Parallel**: For each transaction do the following: + + #. **Sequential**: Start a transaction dedicated to ingesting a subset of the contributions, which may belong to + any number of chunks. + #. **Sequential**: For each contribution in the subset: + + #. 
**Sequential**: Allocate a chunk as needed for the contribution. + #. **Sequential**: Ingest the contributions into the chunk. + + #. **Sequential**: Commit the transaction after ingesting all contributions in the subset. + +The following diagram illustrates the idea: + +.. image:: /_static/ingest-trans-multiple-scattered.png + :target: ../../../_images/ingest-trans-multiple-scattered.png + :alt: Scattered Transactions + +Best use: + +- When the workflow is designed to ingest a large dataset where data are streamed into the workflow. + This scenario is particularly useful when the data are not available in a single file or when the data + are generated on-the-fly by some external process. + +.. tip:: + + One can combine the above scenarios to create a more complex workflow that meets the specific requirements + of the ingest process. + +.. _ingest-api-advanced-transactions-abort: + +Aborting transactions +---------------------- + +The concept of distributed transactions was introduced in the section :ref:`ingest-api-concepts-transactions`. Transactions +are a fundamental mechanism for ensuring the consistency of the ingest process. The system allows aborting transactions +to revert the effects of all contributions made to the catalogs within the scope of the transaction. This operation is particularly useful +when the ingest process encounters an issue that cannot be resolved by the system automatically, or when the failure leaves +the data or metadata tables in an inconsistent state. Transactions are aborted using the following service: + +- :ref:`ingest-trans-management-end` (REST) + +Reasons to abort +^^^^^^^^^^^^^^^^ + +There are two primary reasons for aborting a transaction, detailed in the subsections below. + +Communication Failures +~~~~~~~~~~~~~~~~~~~~~~ + +If any communication problem occurs between the workflow and the system during a contribution request, the workflow **must** unconditionally +abort the corresponding transaction. Such problems create uncertainty, making it impossible to determine if any actual changes were made to +the destination tables. + +This rule applies universally, regardless of the method used for making the contribution request (by reference, by value, synchronous, asynchronous, etc.). + +Ingest System Failures +~~~~~~~~~~~~~~~~~~~~~~ + +Unlike the previously explained scenario, this scenario assumes that the workflow can track the status of attempted contribution requests. +The status information is reported by the ingest system. The workflow can detect a failure in the response object and decide to abort +the transaction. However, the analysis of the failure is done slightly differently for *synchronous* and *asynchronous* requests. + +The algorithm for the *synchronous* requests is rather straightforward. If the attribute ``status`` of the response object +indicates a failure as ``status=0``, the workflow must analyze the ``retry-allowed`` flag in :ref:`ingest-worker-contrib-descriptor` (REST). +If the flag is set to ``0``, the transaction must be aborted. If the flag is set to ``1``, the workflow can retry the contribution request +within the scope of the same transaction using the following service: + +- :ref:`ingest-worker-contrib-retry` (REST) + +The algorithm for the *asynchronous* requests is a bit more complex. The response object for the contribution submission request does not contain +the actual completion status of the request. 
If the request submission was not successful as indicated by ``status=0``, it means the request was incorrect or +made in a wrong context (no transaction open, non-existing table, etc.). In this case, the workflow must abort the transaction. +Otherwise (the response object has ``status=1``), the actual status of the contribution request can be obtained later by polling the system +as explained in the section: + +- :ref:`ingest-api-concepts-contributions-status` (CONCEPTS) + +The REST services explained in this section return the contribution descriptor object that contains the status of the contribution request. +The workflow must first check if a contribution has finished (or failed) or if it's still in progress (or in the wait queue of the processor). + +- :ref:`ingest-worker-contrib-descriptor` (REST) + +Possible values of the attribute ``status`` (**Note** this is an attribute of the contribution itself not the completion status of teh REST request) +are explained in the above-mentioned document. Any value other than ``IN_PROGRESS`` indicates that the contribution request has finished (or failed). +Should the request fail, the workflow must then analyze the flag ``retry-allowed`` as explained above. + +What happens when a transaction is aborted? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Aborting a transaction is a relatively quick operation. The primary change involves the removal of MySQL table partitions associated with the transaction. +The following table files on disk will be deleted: + +- ``#p.MYD``: The data file of the MySQL partition. +- ``#p.MYI``: The index file of the MySQL partition. + +All queued or in-progress contribution requests will be dequeued or stopped. The final status of the requests will be either ``CANCELLED`` (for requests +that were still in the queue) or some other failure state depending on the processing stage of a request. The system will not attempt to process +them further. + +What to do if a transaction cannot be aborted? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It's possible that the system will not be able to abort a transaction. For example, if one of the workers is down or is not responding to the abort request. +In such cases, the status of the transaction will be ``IS_ABORTING`` or ``ABORT_FAILED`` as explained in the section: + +- :ref:`ingest-trans-management-states` (CONCEPTS) + +If the transaction cannot be aborted, the workflow developer must be prepared to handle the situation. There are a few options: + +- The workflow may be programmed to retry the abort operation after a certain timeout. +- If retrying doesn't help, the user of the workflow should contact the Qserv administrators to resolve the issue. diff --git a/doc/ingest/api/advanced/unpublishing.rst b/doc/ingest/api/advanced/unpublishing.rst new file mode 100644 index 0000000000..7eecd383c4 --- /dev/null +++ b/doc/ingest/api/advanced/unpublishing.rst @@ -0,0 +1,54 @@ + + +.. _ingest-api-advanced-unpublishing-databases: + +Ingesting tables into the published catalogs +-------------------------------------------- + +.. warning:: + + Currently, the ingest system only supports adding new tables to databases. It does not permit adding rows to previously + ingested tables. Any attempts to modify existing tables will be blocked by the system. + +In some cases, especially when ingesting large-scale catalogs, the input data for all tables may not be available at the start +of the ingest campaign. Some tables may be ready for ingestion earlier than others. 
Another scenario is when a previously +ingested table needs to be re-ingested with corrected data. In these situations, the catalog must be built incrementally +while allowing Qserv users to access the previously published tables. The previously described workflow of ingesting all +tables at once and then publishing the catalog as a whole would not work here. To address these scenarios, the system allows +temporarily *un-publishing* the catalog to add new or replace existing tables. The following REST service should be used for +this: + +- :ref:`ingest-db-table-management-unpublish-db` (REST) + +Key points to note: + +- This operation is very quick. +- The database state transition is largely transparent to Qserv users, except when replacing an existing table with a newer + version. The un-published database, including all previously ingested tables, will still be visible and queryable by Qserv + users. +- The operation requires the ingest workflow to use an administrator-level authorization key. This will be demonstrated in + the example below. + + +The modified workflow sequence expected in this case is as follows: + +#. Unpublish the existing catalog. +#. Delete an existing table if it needs to be replaced. +#. Register a new table (or a new version of the removed table) or multiple tables as needed. +#. Start transactions. +#. Load contributions for the new tables. +#. Commit transactions. +#. Publish the catalog again. + +This sequence can be repeated as needed to modify the catalog. Note that starting from step **3**, this sequence +is no different from the simple scenario of ingesting a catalog from scratch. The last step of the sequence +will only affect the newly added tables. Hence, the performance of that stage will depend only on the scale and the amount +of data ingested into the new tables. + +Here is an example of how to unpublish a catalog: + +.. code-block:: bash + + curl http://qserv-master01:25081/replication/config/database/test101 \ + -X PUT -H 'Content-Type: application/json' \ + -d'{"admin_auth_key":""}' diff --git a/doc/ingest/api/advanced/warnings.rst b/doc/ingest/api/advanced/warnings.rst new file mode 100644 index 0000000000..063986032c --- /dev/null +++ b/doc/ingest/api/advanced/warnings.rst @@ -0,0 +1,106 @@ + +.. _ingest-api-advanced-warnings: + +Using MySQL warnings for the data quality control +================================================= + +The context +----------- + +The table ingest is presently implemented using MySQL/MariaDB bulk insert statement: + +.. code-block:: sql + + LOAD DATA [LOCAL] INFILE ... + +This is currently the most efficient and performant method for adding rows into tables from input CSV files (:ref:`ingest-api-concepts-contributions`). +The technique is detailed in https://mariadb.com/kb/en/load-data-infile/. + +This method differs significantly from the standard SQL ``INSERT``. One caveat of using this mechanism is that MySQL (MariaDB) +attempts to ingest all input data into the table, even if some rows (or fields within rows) cannot be correctly interpreted. Consequently: + +- The table may contain fewer (or sometimes more) rows than expected. +- Some cell values may be truncated. +- Some cells may contain incorrect data. + +In order to help client applications detect these problems, MySQL offers diagnostic tools (queries, counters) that report +internal issues encountered during data loading. These are detailed in https://mariadb.com/kb/en/show-warnings/. 
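+
+For illustration, the statements below show roughly what these diagnostics look like at the MySQL level. The table
+and file names are hypothetical, and the Ingest system issues the equivalent statements on behalf of the workflow,
+so a workflow never needs to run them directly:
+
+.. code-block:: sql
+
+    LOAD DATA INFILE '/tmp/chunk_57871.csv'
+        INTO TABLE `test101`.`Object_57871` FIELDS TERMINATED BY ',';
+
+    -- The total number of problems encountered while loading the rows
+    SHOW COUNT(*) WARNINGS;
+
+    -- Detailed descriptions of the retained warnings
+    SHOW WARNINGS LIMIT 64;
+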
+ +The current implementation of the Ingest system leverages these features by capturing warnings (as well as notes and errors) +and recording them within the Replication database in association with the corresponding contribution requests. This is done +for each request regardless of how it was submitted, whether via the proprietary binary protocol of +:ref:`ingest-tools-qserv-replica-file`, or by calling the REST services of the Ingest system: + +- :ref:`ingest-worker-contrib-by-ref` (WORKER) +- :ref:`ingest-worker-contrib-by-val` (WORKER) + +Both interfaces also offer a parameter to control the depth of the warning reports by specifying the desired limit on +the number of warnings to be retained for each request. This limit is optional. If not specified at the time of request +submission, the service will use the limit configured at the worker ingest server's startup. + +The REST services that return information on contributions have another optional parameter that indicates whether the client +is interested in seeing just the total number of warnings or the complete description of all warnings retained by the system. +The effect of this parameter on the resulting JSON object returned by the services is explained in: + +- :ref:`ingest-worker-contrib-descriptor` (WORKER) + +In addition to the individual descriptions (if required) of the warnings, the relevant services also report three summary counters: + +``num_warnings``: + The total number of warnings detected by MySQL when loading data into the destination table. Note that this number is not + the same as the number of warning descriptions returned by the REST services. Unlike the latter, ``num_warnings`` represents + the true number of warnings. Only a subset of those is captured in full detail by MySQL. + +``num_rows``: + The total number of rows parsed by the Ingest system in the input file. The ingest service always parses the input files as + it needs to extend each row in order to prepend them with a unique identifier of the corresponding super-transaction (the name + of the added column is ``qserv_trans_id``). + +``num_rows_loaded``: + The total number of rows that were actually loaded by the system into the destination table. Note that in case MySQL + encountered any problems with the input data while interpreting and ingesting those into the destination table, this + counter may not be the same as ``num_rows``. In practice, a value reported in ``num_rows_loaded`` could be either lower or + higher than the value reported in ``num_rows``. + +Using warnings and counters +--------------------------- + +The interfaces described above provide a range of options for workflow developers and Qserv data administrators: + +- Workflow developers can enhance their workflows to analyze the reported counters for contributions. This helps determine + if an ingest operation was genuinely successful or if issues occurred. +- Data administrators can utilize both counters and warning descriptions to analyze ingest results and debug input data. + Any data issues can be traced back to their source (e.g., LSST Pipeline). + +The following subsections present techniques that can be leveraged in this context. 
+ +Analyzing counters +^^^^^^^^^^^^^^^^^^ + +The ingest operation should be considered successful if both of the following conditions are met: + +- ``num_warnings`` equals to ``0`` +- ``num_rows`` is the same as ``num_rows_loaded`` + +If these conditions are not met, a data administrator should inspect the warning descriptions in detail to identify the cause +of the discrepancy. + +Increasing the depth of the warnings queue +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. hint:: + + The default imposed by MySQL would be ``64``. And the upper bound for the limit is ``65535``. + +Significantly increasing the limit above the default value should be considered a temporary measure. All warnings are recorded +within the persistent state of the Replication/Ingest system, and the database serving the system may have limited storage +capacity. Capturing many millions of descriptions across all contributions when ingesting medium-to-large scale catalogs may +also significantly reduce the overall performance of the ingest system. + +Data administrators may temporarily increase the upper limit for the number of warnings to debug input data. The limit can be +set when submitting contribution requests via the APIs mentioned earlier in this chapter. Alternatively, the Replication/Ingest +worker server can be started with the following command-line option: + +.. code-block:: bash + + qserv-replica-worker --worker-loader-max-warnings= diff --git a/doc/ingest/api/concepts/contributions.rst b/doc/ingest/api/concepts/contributions.rst new file mode 100644 index 0000000000..6053d0eff3 --- /dev/null +++ b/doc/ingest/api/concepts/contributions.rst @@ -0,0 +1,87 @@ + +.. _ingest-api-concepts-contributions: + +Table contributions +=================== + +The API defines a *contribution* as a set of rows ingested into a table via a separate request. +These are the most important characteristics of contributions: + +- Each contribution request is always made within the scope of a transaction. This association + is crucial for data provenance and data tagging purposes. +- Information on contributions is preserved in the persistent state of the Ingest system. +- Contributions have unique identifiers assigned by the Ingest system. + +.. _ingest-api-concepts-contributions-atomicity: + +Request atomicity and retries +----------------------------- + +The contribution ingest requests are considered as atomic operations with a few important caveats: + +- The contributions are not committed to the table until the transaction is committed even + if the contribution request was successful. + +- Failed contribution requests must be evaluated by the workflow to determine if the target table + remains in a consistent state. This is indicated by the ``retry-allowed`` attribute returned + in the response object. Based on the value of this flag, the workflow should proceed as follows: + + - ``retry-allowed=0``: The workflow must roll back the transaction and initiate a new + contribution request within the scope of a new transaction. For more details, refer to: + + - :ref:`ingest-api-advanced-transactions-abort` (ADVANCED) + - :ref:`ingest-trans-management-end` (REST) + + - ``retry-allowed=1``: The workflow can retry the contribution within the scope of the same + transaction using: + + - :ref:`ingest-worker-contrib-retry` (REST) + +Note that for contributions submitted by reference, there is an option to configure a request +to automatically retry failed contributions. 
The maximum number of such retries is controlled +by the ``num_retries`` attribute of the request: + +- :ref:`ingest-worker-contrib-by-ref` (REST) + +Contributions pushed to the service by value cannot be automatically retried. The workflow +has to decide explicitly whether to retry the failed contributions. +
+Multiple contributions +---------------------- + +When ingesting rows into a table (whether *partitioned* or *regular*), the workflow does not need to complete +this in a single step from one input file. The Ingest system supports building tables from multiple contributions. +Contributions can be made within different transactions, or multiple contributions can be ingested within the same transaction. +It is the responsibility of the workflow to keep track of what has been ingested into each table, within which transaction, +and to handle any failed transactions appropriately. +
+Ingest methods +-------------- + +Data (rows) of the contributions are typically stored in ``CSV``-formatted files. In this +case the files are either directly pushed to the worker Ingest server or uploaded by +the Ingest service from a location that is accessible to the worker: + +- :ref:`ingest-worker-contrib-by-val` (REST) +- :ref:`ingest-worker-contrib-by-ref` (REST) + +The first option (ingesting by value) also allows pushing the data of contributions +directly from the memory of the client process (workflow) without the need to store the data in files. +
+.. _ingest-api-concepts-contributions-status: + +Status of the contribution requests +----------------------------------- + +The system allows pulling the information on contributions given their identifiers: + +- :ref:`ingest-info-contrib-requests` (REST) + +An alternative option is to query the information on contributions submitted in the scope of +a transaction: + +- :ref:`ingest-trans-management-status-one` (REST) + +The schema of the contribution descriptor objects is covered by: + +- :ref:`ingest-worker-contrib-descriptor` diff --git a/doc/ingest/api/concepts/families.rst b/doc/ingest/api/concepts/families.rst new file mode 100644 index 0000000000..5ce1d13e55 --- /dev/null +++ b/doc/ingest/api/concepts/families.rst @@ -0,0 +1,37 @@ + +.. _ingest-api-concepts-database-families: + +Database families +================= + +The concept of *database families* originated from the need to correctly distribute data of the *partitioned* tables within Qserv. This allows +Qserv to accurately process queries that ``JOIN`` between the tables of different databases. A database family is a group of databases where +all tables within the family share the same partitioning parameters: + +- the number of *stripes* +- the number of *sub-stripes* +- the *overlap* radius + +This will ensure that all *chunk* tables with the same chunk number will have: + +- the same spatial dimensions in the coordinate system adopted by Qserv +- the same number and sizes of *sub-chunks* within each chunk. +
+The families are defined by the Replication/Ingest system and are not visible to Qserv users. Each family has a unique +identifier (name). The system uses family names to correctly distribute the data of partitioned tables among Qserv worker +nodes, ensuring that the data of tables joined in queries are co-located on the same worker node.
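+
+For example, a *ref-match* table that matches objects of two catalogs can only be queried together with its two
+*director* tables if both catalogs belong to the same family, so that the matching chunks reside on the same workers.
+Such a query might look like the following sketch (the database, table, and column names are hypothetical):
+
+.. code-block:: sql
+
+    SELECT o.objectId, s.deepSourceId
+    FROM `Catalog-A`.`Object` AS o
+        JOIN `Catalog-A`.`RefMatch_A_Object_B_DeepSource` AS m ON m.objectId = o.objectId
+        JOIN `Catalog-B`.`DeepSource` AS s ON s.deepSourceId = m.deepSourceId;
+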
Families must be defined before the databases and tables are registered in Qserv. The current implementation of the API +automatically creates a new family when the first database with a unique combination of partitioning parameters is registered +in the system using: + +- :ref:`ingest-db-table-management-register-db` (REST) + +If a family with the same partitioning parameters already exists in the system, the new database will be added to the existing family. +Existing databases and families can be found using the following service: + +- :ref:`ingest-db-table-management-config` (REST) + +For instructions on partitioning the tables with the desired set of parameters, refer to the following document: + +- :ref:`ingest-data` (DATA) diff --git a/doc/ingest/api/concepts/index.rst b/doc/ingest/api/concepts/index.rst new file mode 100644 index 0000000000..65bccd87ce --- /dev/null +++ b/doc/ingest/api/concepts/index.rst @@ -0,0 +1,35 @@ + +.. _ingest-api-concepts: + +============= +Main concepts +============= + +.. hint:: + + This section of the document begins with the high-level overview of the Qserv ingest API. + Please read this section carefully to learn about the main concepts of the API and a sequence + of operations for ingesting catalogs into Qserv. + + After completing the overview, a reader has two options for what to read next: + + - Study the core concepts of the API in depth by visiting subsections: + + - :ref:`ingest-api-concepts-table-types` + - :ref:`ingest-api-concepts-transactions` + - :ref:`ingest-api-concepts-publishing-data` + - etc. + + - Go straight to the practical example of a simple workflow presented at: + + - :ref:`ingest-api-simple` + +.. toctree:: + :maxdepth: 4 + + overview + table-types + transactions + contributions + publishing + families diff --git a/doc/ingest/api/concepts/overview.rst b/doc/ingest/api/concepts/overview.rst new file mode 100644 index 0000000000..35b94d31d7 --- /dev/null +++ b/doc/ingest/api/concepts/overview.rst @@ -0,0 +1,254 @@ +.. _ingest-api-concepts-overview: + +Overview of the ingest workflow +=============================== + +The ingest workflow must accomplish a series of tasks to ingest data into Qserv. +These tasks are presented in the correct order below: + +Plan the ingest +--------------- + +.. hint:: + + You may also contact Qserv experts or Qserv administrators to get help on the planning stage. +
+A number of important questions need to be answered and decisions made ahead of time in the following +areas before starting to ingest data into Qserv. Knowing these facts allows organizing the ingest activities in +the most efficient way. +
+Creating a new database or adding tables to an existing one? + +- :ref:`ingest-api-concepts-publishing-data` (CONCEPTS) + +What are the types of tables to be ingested? + +- :ref:`ingest-api-concepts-table-types` (CONCEPTS) + +What should be the values of the partitioning parameters of the partitioned tables? + +- :ref:`ingest-api-concepts-database-families` (CONCEPTS) + +What is the scale of the planned ingest effort? + +- The amount of data (rows, bytes) to be ingested in each table +- The number of the ``CSV`` files to be ingested +- Sizes of the files +- The number of the workers that are available in Qserv + +Where will the ready-to-ingest data be located? + +- Are there any data staging areas available? +- :ref:`ingest-api-advanced-contributions` (ADVANCED) + +Prepare the input data +---------------------- + +Data files (table *contributions*) to be ingested into Qserv need to be in the ``CSV`` format. 
It's up to the workflow +to ensure that the data are in the right format and sanitized so that the values of the columns +are compatible with MySQL expectations. + +- :ref:`ingest-data` (DATA) + +Note that the data preparation stage depends on the types of tables to be ingested. Read about the table types in: + +- :ref:`ingest-api-concepts-table-types` (CONCEPTS) +
+Register or un-publish a database +--------------------------------- + +The main goal of this step is to ensure that the database is ready for registering new tables. Firstly, +the database should be registered in the Replication/Ingest system. Secondly, the database should be +found in (or put into) the *unpublished* state. Read about the database states in the following document +section: + +- :ref:`ingest-api-concepts-publishing-data` (CONCEPTS) + +Further steps depend on the state of the database. If the database doesn't exist in the Replication/Ingest system, +it should be registered using: + +- :ref:`ingest-db-table-management-register-db` (REST) + +The newly registered database will always be in the *unpublished* state. If the database already exists in +the Replication/Ingest system and is in the *published* state, it should be *unpublished* using: + +- :ref:`ingest-db-table-management-unpublish-db` (REST) +
+Register tables +--------------- + +Before ingesting data into Qserv the corresponding tables should be registered in the Replication/Ingest system. +Tables are registered using: + +- :ref:`ingest-db-table-management-register-table` (REST) + +Table registration requests should include various information on each table, such as: + +- the name of the database where the table belongs +- the name of the table +- the type of the table +- the schema + +Detailed instructions on this subject can be found in the description of the service mentioned above.
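+
+For illustration only, a registration request for a simple *regular* (fully replicated) table could carry a JSON
+payload similar to the following sketch. The table name and schema are hypothetical, and the complete, authoritative
+list of attributes is given in the service description referenced above:
+
+.. code-block:: json
+
+    {   "database" : "test101",
+        "table" : "Filter",
+        "is_partitioned" : 0,
+        "schema" : [
+            {"name" : "filterId", "type" : "INT"},
+            {"name" : "filterName", "type" : "VARCHAR(32)"}
+        ],
+        "auth_key" : ""
+    }
+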
Configure the Ingest service +---------------------------- + +This step is optional. It's mostly needed to adjust the default configuration parameters of the Ingest service +to allow pulling contributions from the data staging areas, such as web servers, cloud storage, etc. Examples of +the configuration parameters are: timeouts, the number of parallel requests, SSL/TLS certificates, HTTP/HTTPS proxy +settings, etc. More information on this subject can be found in: + +- :ref:`ingest-config` (REST) + +These parameters can be adjusted in real time as needed. The changes take effect immediately. Note that +the parameters are set on the database level. For example, the configuration parameters set for the database ``db1`` +will not affect the ingest activities for the database ``db2``. + +.. note:: + + Please be aware that the ingest activities can also be affected by the global configuration parameters of + the Replication/Ingest system: + + - :ref:`ingest-api-advanced-global-config` (ADVANCED) +
+Start transactions +------------------ + +Making the right choices on how many transactions to start and how many contributions to send in the scope of each transaction +is key to the ingest performance. The transactions are used to group the contributions. In some cases, when +contributions fail, the transactions should be aborted. Should this happen, all ingest efforts made in the scope of +the failed transactions would have to be rolled back, and the workflow would have to start the corresponding ingest +activities from the beginning. Hence the workflow should be prepared to handle the transaction aborts and make +reasonable decisions on the amount of data to be sent in the scope of each transaction (the "size" of the transaction) +based on the risk assessment made by the workflow developers or the data administrators who would be using the workflow +for ingesting a catalog. + +.. hint:: + + It's recommended to make the transaction management logic of the workflow configurable. + +More information on this subject can be found in: + +- :ref:`ingest-api-concepts-transactions` (CONCEPTS) +- :ref:`ingest-api-advanced-transactions` (ADVANCED) +- :ref:`ingest-trans-management-start` (REST) +
+Figure out locations of tables and chunks +----------------------------------------- + +The design of the API requires the workflow to know the locations of the tables and chunks at workers. +The locations are needed to forward the table contribution requests directly to the corresponding worker +services. The locations can be obtained using services covered in the following document: + +- :ref:`table-location` (REST) +
+Send the data to the workers +---------------------------- + +At this stage the actual ingest activities are started. The reader should read the following document first +to understand the concept of the *contributions*: + +- :ref:`ingest-api-concepts-contributions` (CONCEPTS) + +The REST API for initiating the contribution requests is covered in the following documents: + +- :ref:`ingest-worker-contrib-by-ref` (REST) +- :ref:`ingest-worker-contrib-by-val` (REST) +
+Monitor the progress of the ingest activities +---------------------------------------------- + +The workflow should always be aware of the progress of the ingest activities, and of the status of the +contribution requests. This is needed for (at least) three reasons: + +#. To know when the ingest activities have finished +#. To know when the ingest activities have failed (and which requests failed) +#. To make more contribution requests if needed + +In the simplest *linear* design of the workflow, such as the one presented in the :ref:`ingest-api-simple`, +the workflow may implement the monitoring as a separate step after making all contribution requests. In more +realistic scenarios the monitoring stage should be an integral part of the same logic that is responsible for +making the contribution requests. + +Besides monitoring the contribution requests, the workflow should also monitor the status of the databases, +transactions and Qserv workers to be sure that the ingest activities are going as planned and that the underlying +services are healthy. These are the relevant services for the monitoring: + +- :ref:`ingest-config-global-workers` (REST) +- :ref:`ingest-trans-management-status` (REST) +
+Commit/abort the transactions +----------------------------- + +Once all contributions are successfully ingested, the transactions should be committed. If any problems occurred within +the transactions, the workflow should be prepared to handle the transaction aborts. Both operations are performed by: + +- :ref:`ingest-trans-management-end` (REST) + +Read more about the transactions and transaction aborts in: + +- :ref:`ingest-api-concepts-transactions` (CONCEPTS) +- :ref:`ingest-api-advanced-transactions-abort` (ADVANCED) + +Another option in the case of a catastrophic failure during the ingest would be to scrap the whole database +or the tables and start the ingest activities from the beginning. 
This is a more radical approach, but it's +sometimes the only way to recover from the failure. The services for deleting the database and the tables are +covered in: + +- :ref:`ingest-db-table-management-delete` (REST) + +.. warning:: + + The deletion of the database or the tables is an irreversible operation. Use it with caution. + +Publish the database +-------------------- + +.. warning:: + + Depending on the types of tables created by the workflow, the amount of data ingested into the tables, + and the number of transactions created during the effort, the database publishing operation may take a while. + There is always a chance that it may fail should anything unpredicted happen during the operation. This could be + a problem with the underlying infrastructure, the network, the database, the workers, etc. Or it could be a problem + with the ingested data. The workflow should be prepared to handle the failure of the database publishing operation + and check the completion status of the request. + +.. hint:: + + The current implementation of the operation is *synchronous*, which means the workflow would have to wait + before the service sends back a response to be analyzed. However, the implementation of the operation is *idempotent*, + which means the workflow can retry the operation as many times as needed without any side effects should any network + problems occur during the operation. + +Formally, this would be the last stage of the actual ingest. The database and the tables are published to make them +visible to the users. The database and the tables are published using the following services: + +- :ref:`ingest-db-table-management-publish-db` (REST) + +All new tables that were registered in the database by the workflow would be published automatically. +And the database would be placed into *published* state. + +Read more on this concept in: + +- :ref:`ingest-api-concepts-publishing-data` (CONCEPTS) + +Verify the ingested data products +--------------------------------- + +This step is optional. A possibility of implementing the automatic verification if the ingested +data products are correct and consistent depends on the workflow requirements and the data. +These are some very basic verification steps that the workflow may want to implement: + +- the data can be queried +- the data can be compared to the original data +- the number of rows in the tables is correct + +Perform the optional post-ingest data management operation on the ingested tables +--------------------------------------------------------------------------------- + +This step is optional. The workflow may want to perform some post-ingest data management operations on the ingested tables. +An alternative approach is to perform these operations after verifying the ingested data products. +These operations are covered in: + +- :ref:`ingest-api-post-ingest` (API) diff --git a/doc/ingest/api/concepts/publishing.rst b/doc/ingest/api/concepts/publishing.rst new file mode 100644 index 0000000000..bc27bdfc92 --- /dev/null +++ b/doc/ingest/api/concepts/publishing.rst @@ -0,0 +1,68 @@ + +.. _ingest-api-concepts-publishing-data: + +Publishing databases and tables +=============================== + +Databases +--------- + +Databases in Qserv can be in one of two states: *published* or *unpublished*. Databases in the *published* state +are visible to Qserv users and can be queried. Generally, databases in this state are considered static and cannot be modified. 
+
However, certain operations are still allowed on the tables of published databases. These operations are documented in +the following section: + +- :ref:`ingest-api-post-ingest` + +Databases in the *published* state are also subject to routine replica management operations performed by +the Qserv Replication system. + +The *unpublished* state is reserved for making significant changes to the table data, the table schema, ingesting new tables, etc. +The replica management operations are not performed on the databases in this state in order to avoid conflicts with the ongoing changes. + +Note that newly created databases are always in the *unpublished* state, and they are not visible to the Qserv users. When all desired +tables are ingested into the database and the database is ready for querying, it should be *published* using the following service: + +- :ref:`ingest-db-table-management-publish-db` (REST) + +.. note:: + + Before a database can be published, all transactions open within the context of the database must be either committed or rolled back + using the following service: + + - :ref:`ingest-trans-management-end` (REST) + + If this condition is not met, the database publishing service will reject the request. It is the responsibility of the workflow + to manage these transactions. +
+Databases can also be unpublished to allow adding new tables, or for performing significant changes to the table data or schema, +using the following service: + +- :ref:`ingest-db-table-management-unpublish-db` (REST) + +.. note:: + + The database unpublishing service does not affect the visibility of the database to Qserv users. All tables that existed + in the database before unpublishing can still be queried by Qserv users. The unpublishing operation is transparent to the users. +
+Tables +------ + +Tables in Qserv can be in one of two states: *published* or *unpublished*. This distinction is relevant only for tables within +databases that are in the *unpublished* state. Unlike databases, the state of a table indicates whether the table is fully ingested +and should not be modified thereafter, regardless of the database's state. + +Newly created tables are always in the *unpublished* state. Once all desired data is ingested into the table and it is ready for querying, +it should be published. This occurs indirectly when the database is published. After publication, the table is marked as *published* +and becomes visible to Qserv users. + +During the table publishing stage the Replication/Ingest system: + +- removes MySQL partitions from the data tables at workers +- (optionally) removes MySQL partitions from the *director* index table + +The last step is optional. It only applies to the *director* tables, and only if the database was registered with +the optional attribute set as ``auto_build_secondary_index=1`` when calling the service: + +- :ref:`ingest-db-table-management-register-db` (REST) + diff --git a/doc/ingest/api/concepts/table-types.rst b/doc/ingest/api/concepts/table-types.rst new file mode 100644 index 0000000000..c15720ae64 --- /dev/null +++ b/doc/ingest/api/concepts/table-types.rst @@ -0,0 +1,79 @@ +.. _ingest-api-concepts-table-types: + +Types of tables in Qserv +======================== + +There are two types of tables in Qserv: + +- *regular* (fully replicated) +- *partitioned* (distributed) + +A copy of the regular table exists at each worker node. Tables of this type are relatively small so that they can fit on disk +at each worker. 
The tables are usually meant to store the metadata or the reference data for the large partitioned tables. +The following diagram illustrates the concept: + +.. image:: /_static/ingest-table-types-regular.png + :target: ../../../_images/ingest-table-types-regular.png + :alt: Regular (fully-replicated) Tables + +The partitioned tables are distributed across the workers. These tables are much larger than the regular tables and they can't +fit on disk at each worker. Each such table is *horizontally* (by rows) partitioned into the so-called *chunks* (or *chunk tables*). Each chunk +table is a separate table that is stored on the MySQL server of a worker. Depending on the values of the partitioning parameters (specifically +on the number of *stripes*) a catalog may have from 10,000 to over 100,000 chunks. The names of the chunk tables are based on the name of +the original table, the chunk number and the optional "FullTableOverlap" suffix after the base name of the table. See the following +section for more information on the naming convention of the tables in Qserv: + +- :ref:`ingest-general-base-table-names` + +Each chunk table has a subset of the rows of the original table. A superposition of rows from all the chunk tables of the same Qserv table +is equal to the set of rows in the original (*base*) table. The following diagram illustrates the concept: + +.. image:: /_static/ingest-table-types-partitioned.png + :target: ../../../_images/ingest-table-types-partitioned.png + :alt: Partitioned (chunk) Tables + + +Note that each chunk of the partitioned table maps to a rectangular sector of the Sky based on the spatial coordinate system adopted by Qserv. +The spatial coordinates of all rows within a chunk table fall within the spatial area of the chunk. The spatial areas of different +chunks never overlap. + +.. note:: + + The chunk *overlap* table includes a "halo" of rows from the neighboring chunks. The size of the overlap is defined by the *overlap* parameter + of the table. The overlap is used to ensure that the rows that are close to the chunk boundary are not missed by the so-called "near-neighbour" + queries. These tables are explained later in this section. +
+The chunk tables are made by the partitioning process that is documented in: + +- :ref:`ingest-data` + +The partitioned tables are further classified into the following subtypes: + +- *director* tables +- *dependent* tables, which are further classified into: + + - *simple* (1 director) + - *ref-match* (2 directors) + +The *director* tables are the tables in which each row has a unique identifier, similar to a *primary key* in relational algebra. +The *dependent* tables have rows which depend on the rows of the corresponding *director* tables via a *foreign*-like key referencing +the corresponding *primary key*. The *simple* tables have only one *director* table, while the *ref-match* tables have two *director* tables. +The *ref-match* tables are used to store the matches between the objects of two different tables. The following diagram illustrates these +concepts: + +.. image:: /_static/ingest-table-types-dependent.png + :target: ../../../_images/ingest-table-types-dependent.png + :alt: Dependent Tables + +A *director* table is not required to have any *dependent* tables; each *director* is usable and queryable by itself. The *dependent* tables +must have the corresponding *director* tables. The same rules apply to the *ref-match* tables. 
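+
+To make the relationship more concrete, here is a minimal sketch of what simplified schemas of a *director* table and
+one of its *dependent* tables could look like. The table and column names are hypothetical, and the real tables also
+carry additional columns (such as the partitioning columns ``chunkId`` and ``subChunkId``):
+
+.. code-block:: sql
+
+    -- Director table: each row is identified by a unique key (objectId)
+    CREATE TABLE `Object` (
+        `objectId` BIGINT UNSIGNED NOT NULL,
+        `ra`       DOUBLE NOT NULL,
+        `dec`      DOUBLE NOT NULL
+    );
+
+    -- Dependent table: each row refers to a row of the director table
+    -- via the foreign-key-like column objectId
+    CREATE TABLE `Source` (
+        `sourceId` BIGINT UNSIGNED NOT NULL,
+        `objectId` BIGINT UNSIGNED NOT NULL,
+        `psfFlux`  DOUBLE
+    );
+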
+
Each chunk table of the director table has the corresponding chunk *overlap* table. The *overlap* table includes a subset of rows from the chunk table +and a "halo" of rows from the neighboring chunks. The size of the overlap is defined by the *overlap* parameter of the table. The idea of the overlap +is illustrated in the following diagram: + +.. image:: /_static/subchunks.png + :target: ../../../_images/subchunks.png + :alt: Chunks and sub-chunks + +The diagram shows the sub-chunk boundaries within the chunk table. diff --git a/doc/ingest/api/concepts/transactions.rst b/doc/ingest/api/concepts/transactions.rst new file mode 100644 index 0000000000..b736824e3b --- /dev/null +++ b/doc/ingest/api/concepts/transactions.rst @@ -0,0 +1,257 @@ +.. _ingest-api-concepts-transactions: + +Transactions +============ + +The distributed transaction mechanism is one of the key technologies that was +implemented in the Qserv Ingest system to allow for the incremental updates of the overall state of the data and metadata +while ensuring the consistency of the ingested catalogs. Transactions also play an important role in allowing +the high-performance ingest activities to be performed in a distributed environment. Transactions, if used correctly, may +significantly increase the level of parallelism of the ingest workflows. Transactions are not visible to end users. +
+Transactions are open in the scope of a database. It's the responsibility of the workflows to manage transactions as needed +for the ingest activities using the following REST services: + +- :ref:`ingest-trans-management` (REST) +
+Isolation and parallelism +------------------------- + +The first role of the transaction is to provide the isolation of the ingest activities. The transactions allow for the +parallel ingest of the data into the tables located at the many workers of the Qserv cluster, and into the same table +located at the same worker. The transactions are started and committed independently of each other. The transactions +are not visible to the users and are not used for the user queries. + +To understand why the transactions help to increase the level of parallelism of the ingest activities, read +the last section on this page: + +- :ref:`ingest-api-concepts-transactions-impl` (CONCEPTS) +
+Row tagging +----------- + +The second role of the transactions is to implement the tagging mechanism for the ingested data. All rows +ingested into the data tables and the *director* index tables are tagged with a transaction identifier +that is unique for each transaction. Hence, all contribution requests made into the tables via this API are +associated with a specific identifier. The identifiers are usually sent in the service request and response objects +in an attribute ``transaction_id``. As an example of the attribute, see a description of the following REST service: + +- :ref:`ingest-worker-contrib-by-ref` (REST) + +An effect of such tagging can be seen as a special column called ``qserv_trans_id`` that is automatically added by +the Ingest system into the table schema of each table. In the current implementation of the system, this is the very +first column of the table. The column is of the ``UNSIGNED INT`` type and is not nullable. The column is visible +to Qserv users and is queryable. Here is an example of a query and the corresponding result set illustrating the concept: + +.. 
code-block:: sql + +    SELECT `qserv_trans_id`, `objectId`,`chunkId` +    FROM `dp02_dc2_catalogs`.`Object` +    WHERE `qserv_trans_id` IN (860, 861); + +    +----------------+---------------------+---------+ +    | qserv_trans_id | objectId            | chunkId | +    +----------------+---------------------+---------+ +    |            860 | 1249546586455828954 |   57871 | +    |            860 | 1249546586455828968 |   57871 | +         . . . . +    |            861 | 1252546054176403713 |   57891 | +    |            861 | 1252546054176403722 |   57891 | +    +----------------+---------------------+---------+ + +.. note:: + + The database administrator can decide to drop the column from the table schema if there is a need to save the space + in the table. The column is not used by Qserv for any other purpose than the ingest activities. And once the ingest + is completed, the column is not needed anymore except for QA-ing the data, bookkeeping and data provenance. +
+Checkpointing +------------- + +Transactions also provide the checkpointing mechanism that allows rolling back to a prior consistent state of the affected tables +should any problem occur during the ingest activities. Transactions may span many workers and tables located +at the workers. It's up to the workflow to decide what contributions to ingest and in what order to ingest those in +the scope of each transaction. + +The following diagram illustrates the concept of the transactions in Qserv. There are 3 transactions that are started and +committed sequentially (in real-life scenarios the transactions can be, and should be, started and committed in parallel, +and independently of each other). Data are ingested into two separate tables located at 2 workers. The diagram also shows +a failure to ingest the data into table ``Table-A`` at ``Worker-X`` in the scope of ``Transaction-2``: + +.. image:: /_static/ingest-transactions-failed.png + :target: ../../../_images/ingest-transactions-failed.png + :alt: Failed Transaction Contribution + +At this point the table ``Table-A`` at ``Worker-X`` is found in an inconsistent state. The workflow can decide to roll back +the failed transaction and to re-try the ingest activities in the scope of a new transaction. The rollback is performed using +the following service: + +- :ref:`ingest-trans-management-end` (REST) + +Also read the following document to learn more about the transaction abort: + +- :ref:`ingest-api-advanced-transactions-abort` (ADVANCED) + +Removing the failed transaction would result in the following state of the tables, which is *clean* and consistent: + +.. image:: /_static/ingest-transactions-aborted.png + :target: ../../../_images/ingest-transactions-aborted.png + :alt: Aborted Transaction + +After that, the workflow can re-try **ALL** ingest activities that were meant to be done in the scope of the previously +failed transaction by starting another transaction. If the ingest activities are successful, the tables will be in a +consistent state: + +.. image:: /_static/ingest-transactions-resolved.png + :target: ../../../_images/ingest-transactions-resolved.png + :alt: Another Transaction + + +Transaction status and state +---------------------------- + +Each transaction is in a well-defined *state* at each moment in time. The state is a part of the broader collection +of the transaction attributes called the transaction *status*. 
All of this can be obtained by calling services +documented in the following section: + +- :ref:`ingest-trans-management-status` (REST) +- :ref:`ingest-trans-management-states` (REST) + +The services provide a flexible filtering mechanism for finding the transactions of interest in various scopes and states +and reporting the information at different levels of detail as needed by the workflows or other applications. + +These are just a few typical applications for this information in the context of the workflows: + +- *Dynamic* transaction management (versus the *static* management where all transactions would be started at once): + + - Starting a limited number of transactions at the beginning of the ingest + - Monitoring the progress and performance of the transactions + - Committing transactions where all table contributions were successfully ingested + - Starting new transactions to load more contributions to meet the performance goals + +- Locating the failed transactions and re-trying the ingest activities in the scope of new transactions. +- Locating failed table contribution requests that were made in the scope of a transaction to see if it's possible + to retry the contributions w/o aborting the transaction. +- Building a Web dashboard. +
+Contexts +-------- + +When starting (or finishing) a transaction using the corresponding services (see below), a workflow may optionally +attach a piece of arbitrary workflow-defined information (a JSON object) to the transaction. The object is called +the *context*. It will be stored within the Replication/Ingest system's database and be associated with the transaction. +The object could be as large as ``16 MB``. In effect, the context is a part of the transaction's persistent state. + +The initial version of the context object is passed along with the transaction start request in the attribute ``context``: + +- :ref:`ingest-trans-management-start` (REST) + +The context object may also be updated when aborting or committing a transaction by: + +- :ref:`ingest-trans-management-end` (REST) + +Contexts are also retrieved by the status retrieval services: + +- :ref:`ingest-trans-management-status` (REST) + +The workflow may use the contexts for the following reasons: + +- Store the information on the input contributions made in the scope of a transaction to be used later for recovery + from failures. The information may include locations of the input files, as well as any other information + needed to retry the contributions (see the sketch below). Making the workflows depend on the contexts may simplify + the implementation of the workflows by avoiding the need to store the information in external databases or files. + Altogether, the contexts may improve the robustness of the workflows. +- Store the information for the purpose of internal bookkeeping that would be independent of the user workflow's + infrastructure or environment. +- Store the additional information to be used as a source of metadata for data provenance systems.
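+
+As an illustration, a workflow could attach a context object similar to the following sketch when starting a transaction.
+The attributes are entirely hypothetical and workflow-defined; the Ingest system stores the object as-is and does not
+interpret it:
+
+.. code-block:: json
+
+    {   "workflow" : "my-ingest-workflow",
+        "input_files" : [
+            "https://example.org/data/Object/chunk_57871.csv",
+            "https://example.org/data/Object/chunk_57891.csv"
+        ],
+        "operator" : "jsmith",
+        "notes" : "second attempt after aborting transaction 860"
+    }
+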
+Obviously, the workflow implementation may have its own mechanism for that, and it probably should. However, attaching +the metadata to transactions in the persistent state of the system along with the transactions has a few important benefits. +In particular, it guarantees consistency between transactions and contexts. Secondly, it provides the precise timing for +the ingest operations (the start and finish times are measured by the Ingest system at the right moments). +Thirdly, the information may be seen from the general-purpose Web Dashboard application of Qserv and could also be used +by the database support teams for building various metrics on the performance of the Qserv Ingest system. + + +.. _ingest-api-concepts-transactions-impl: + +Implementation Details +---------------------- + +The Qserv transactions are quite different from the ones in the typical RDBMS implementations. Firstly, they are not designed +as an isolation mechanism for executing user queries, and they are not visible to Qserv users. In Qserv, tables that are being +ingested are not seen or queryable by the users anyway. The main purpose of the transactions in Qserv is to allow for +the incremental updates of the distributed state of data in Qserv across many (potentially hundreds of) workers. +Each worker runs its own instance of the MySQL/MariaDB server which is not aware of the others. Some might say that +transactions are associated with *vertical slices* of rows in the tables that are located at the workers. + +The second technical problem to be addressed by the transactions is a lack of transaction support in the MyISAM table +engine that is used in Qserv for the data tables. The MyISAM engine is used in Qserv due to its simplicity and high performance. +Unfortunately, failures while ingesting data into the MyISAM tables can leave the table in a corrupted state. The transactions +provide a mechanism for rolling back the tables to a consistent state in case of failures. The current implementation +of the transactions in Qserv is based on the MySQL/MariaDB partitions: + +- https://mariadb.com/kb/en/partitioning-overview/ + + +.. warning:: + + When the catalog is being published, the partitioned MyISAM tables are converted to the regular format. + This operation is performed by the Qserv Ingest system. + The conversion is a time-consuming operation and may take a long time to complete for + a single table. An observed performance of the operation per table is on a scale of ``20 MB/s`` to ``50 MB/s``. + However, a typical catalog will have thousands of such chunk tables which would be processed in parallel + at all workers of the Qserv cluster. The resulting performance of the conversion would be on a scale of + many ``GB/s``, and the operation would be completed in a reasonable time. + + - A definition of the *reasonable time* is given rather loosely here. An overall idea is that + such conversion should be on the same scale (smaller) as the table ingest *per se*. A similar + philosophy is applied to other data management operations in Qserv besides the ingest. +
+From the perspective of the workflows, these are the most important limitations of the transactions: + +- Transaction identifiers are 32-bit unsigned integer numbers. The maximum number of the transactions that can be + started in the system is 2^32 - 1 = 4,294,967,295. The transactions are not re-used, so the number of the transactions + that can be started in the system is limited by the number of the unique transaction identifiers that can be generated + by the system. + +- The transaction with the identifier ``0`` is reserved for the system for the so-called *default* transaction. + The workflows can't ingest any contributions in the context of that transaction, or manage this special transaction. + +- MySQL tables only allow up to ``8192`` partitions per table. This is a limitation of the MySQL/MariaDB partitioning mechanism. 
+ And there is a certain overhead in MySQL for each partition. Hence, it's not recommended to start more than ``1,000`` transactions + during the ingest. + +Transaction numbers directly map to the partition identifiers of the MySQL/MariaDB partitioned tables. Here is an example +of a few chunk tables of a catalog that is still being ingested: + +.. code-block:: bash + + -rw-rw----+ 1 rubinqsv gu 4868 Sep 10 20:48 gaia_source_1012.frm + -rw-rw----+ 1 rubinqsv gu 48 Sep 10 20:48 gaia_source_1012.par + -rw-rw----+ 1 rubinqsv gu 0 Sep 10 20:46 gaia_source_1012#P#p0.MYD + -rw-rw----+ 1 rubinqsv gu 2048 Sep 10 20:46 gaia_source_1012#P#p0.MYI + -rw-rw----+ 1 rubinqsv gu 0 Sep 10 20:46 gaia_source_1012#P#p1623.MYD + -rw-rw----+ 1 rubinqsv gu 2048 Sep 10 20:46 gaia_source_1012#P#p1623.MYI + -rw-rw----+ 1 rubinqsv gu 31000308 Sep 10 20:48 gaia_source_1012#P#p1628.MYD + -rw-rw----+ 1 rubinqsv gu 2048 Sep 11 19:49 gaia_source_1012#P#p1628.MYI + -rw-rw----+ 1 rubinqsv gu 4868 Sep 10 20:48 gaia_source_1020.frm + -rw-rw----+ 1 rubinqsv gu 48 Sep 10 20:48 gaia_source_1020.par + -rw-rw----+ 1 rubinqsv gu 0 Sep 10 20:46 gaia_source_1020#P#p0.MYD + -rw-rw----+ 1 rubinqsv gu 2048 Sep 10 20:46 gaia_source_1020#P#p0.MYI + -rw-rw----+ 1 rubinqsv gu 51622084 Sep 10 20:48 gaia_source_1020#P#p1624.MYD + -rw-rw----+ 1 rubinqsv gu 2048 Sep 11 19:49 gaia_source_1020#P#p1624.MYI + -rw-rw----+ 1 rubinqsv gu 0 Sep 10 20:46 gaia_source_1020#P#p1630.MYD + -rw-rw----+ 1 rubinqsv gu 2048 Sep 10 20:46 gaia_source_1020#P#p1630.MYI + -rw-rw----+ 1 rubinqsv gu 4868 Sep 10 20:47 gaia_source_1028.frm + -rw-rw----+ 1 rubinqsv gu 48 Sep 10 20:47 gaia_source_1028.par + -rw-rw----+ 1 rubinqsv gu 0 Sep 10 20:46 gaia_source_1028#P#p0.MYD + -rw-rw----+ 1 rubinqsv gu 2048 Sep 10 20:46 gaia_source_1028#P#p0.MYI + -rw-rw----+ 1 rubinqsv gu 739825104 Sep 10 20:48 gaia_source_1028#P#p1625.MYD + -rw-rw----+ 1 rubinqsv gu 2048 Sep 11 19:49 gaia_source_1028#P#p1625.MYI + -rw-rw----+ 1 rubinqsv gu 0 Sep 10 20:46 gaia_source_1028#P#p1629.MYD + -rw-rw----+ 1 rubinqsv gu 2048 Sep 10 20:46 gaia_source_1028#P#p1629.MYI + +This snapshot was taken by looking at the MariaDB data directory at one of the Qserv workers. Note that the tables +are partitioned by the transaction numbers, where the transaction identifiers are the numbers after the ``#P#`` in +the file names. diff --git a/doc/ingest/api/index.rst b/doc/ingest/api/index.rst new file mode 100644 index 0000000000..99353ca24b --- /dev/null +++ b/doc/ingest/api/index.rst @@ -0,0 +1,21 @@ + +.. note:: + + Information in this guide corresponds to the version **38** of the Qserv REST API. Keep in mind + that each implementation of the API has a specific version. The version number will change + if any changes to the implementation or the API that might affect users will be made. + The current document will be kept updated to reflect the latest version of the API. + +##################################### +The Ingest Workflow Developer's Guide +##################################### + +.. toctree:: + :maxdepth: 4 + + introduction + concepts/index + simple/index + advanced/index + post-ingest/index + reference/index diff --git a/doc/ingest/api/introduction.rst b/doc/ingest/api/introduction.rst new file mode 100644 index 0000000000..9919e399f0 --- /dev/null +++ b/doc/ingest/api/introduction.rst @@ -0,0 +1,90 @@ +Introduction +============ + +This document presents an API that is available in Qserv for constructing the data ingest applications (also mentioned +in the document as *ingest workflows*). 
The API is designed to provide a high-performance and reliable mechanism for +ingesting large quantities of data where the high performance or reliability of the ingests is at stake. +The document is intended to be a practical guide for the developers who are building those applications. +It provides a high-level overview of the API, its main components, and the typical workflows that can be built using the API. + +At the very high level, the Qserv Ingest system is comprised of: + +- The REST server that is integrated into the Master Replication Controller. The server provides a collection + of services for managing metadata and states of the new catalogs to be ingested. The server also coordinates + its own operations with Qserv itself and the Qserv Replication System to prevent interferences with those + and minimize failures during catalog ingest activities. +- The Worker Ingest REST service run at each Qserv worker node alongside the Qserv worker itself and the worker MariaDB server. + The role of these services is to actually ingest the client's data into the corresponding MySQL tables. + The services would also do an additional (albeit, minimal) preprocessing and data transformation (where or when needed) + before ingesting the input data into MySQL. Each worker server also includes its own REST server for processing + the "by reference" ingest requests as well as various metadata requests in the scope of the workers. + +Implementation-wise, the Ingest System heavily relies on services and functions of the Replication System including +the Replication System's Controller Framework, various (including the Configuration) services, and the worker-side +server infrastructure of the Replication System. + +Client workflows interact with the system's services via open interfaces (based on the HTTP protocol, REST services, +JSON data format, etc.) and use ready-to-use tools to fulfill their goals of ingesting catalogs. + +Here is a brief summary of the Qserv Ingest System's features: + +- It introduces the well-defined states and semantics into the ingest process. With that, a process of ingesting a new catalog + now has to go through a sequence of specific steps maintaining a progressive state of the catalog within Qserv + while it's being ingested. The state transitions and the corresponding enforcements made by the system would + always ensure that the catalog would be in a consistent state during each step of the process. + Altogether, this model increases the robustness of the process, and it also makes it more efficient. + +- To facilitate and implement the above-mentioned state transitions the new system introduces a distributed + *tagging* and *checkpointing* mechanism called *super-transactions*. The transactions allow for incremental + updates of the overall state of the data and metadata while allowing to safely roll back to a prior consistent + state should any problem occur during data loading within such transactions. + + - The data tagging capability of the transactions can be also used by the ingest workflows and by + the Qserv administrators for bookkeeping of the ingest activities and for the quality control of + the ingested catalogs. + +- In its very foundation, the system has been designed for constructing high-performance and parallel ingest + workflows w/o compromising the consistency of the ingested catalogs. 
- For the actual data loading, the system offers plenty of options, including pushing data into Qserv directly
  via a proprietary binary protocol using :ref:`ingest-tools-qserv-replica-file`, :ref:`ingest-worker-contrib-by-val`
  in the HTTP request body, or :ref:`ingest-worker-contrib-by-ref`. In the latter case, the input data (so-called table
  *contributions*) will be pulled by the worker services from remote locations as instructed by the ingest workflows.
  The presently supported sources include the object stores (via the HTTP/HTTPS protocols) and the locally mounted
  distributed filesystems (via the POSIX protocol).

  - The ongoing work on the system includes developing support for ingesting contributions
    from the S3 object stores.

- The data loading services also collect various information on the ongoing status of the ingest activities,
  abnormal conditions that may occur during reading, interpreting, or loading the data into Qserv, as well
  as the metadata for the data that is loaded. The information is retained within the persistent
  state of the Replication/Ingest System for monitoring and debugging purposes. Feedback is provided
  to the workflows on various aspects of the ingest activities. The feedback is useful for the workflows to adjust their
  behavior and to ensure the quality of the data being ingested.

  - To get further info on this subject, see sections :ref:`ingest-general-error-reporting` and
    :ref:`ingest-worker-contrib-descriptor-warnings`.
    In addition, the API provides REST services for obtaining metadata on the state of catalogs, tables, distributed
    transactions, contribution requests, the progress of the requested operations, etc.

**What the Ingest System does NOT do**:

- As per its current implementation (which may change in the future) it does not automatically partition
  input files. This task is expected to be a responsibility of the ingest workflows. The only data formats
  presently supported for the table payload are ``CSV`` and ``JSON`` (the latter primarily for ingesting
  user-generated data products as explained in :ref:`http-frontend-ingest`).

- It does not (with the exception of adding an extra leading column ``qserv_trans_id`` required by
  the implementation of the previously mentioned *super-transactions*) pre-process the input ``CSV``
  payload sent to the Ingest Data Servers by the workflows for loading into tables.
  It's up to the workflows to sanitize the input data and to make them ready to be ingested into Qserv.

More information on the requirements and the low-level technical details of the implementation, beyond what is
needed for the purposes of this document, can be found elsewhere.

It's recommended to read the document sequentially. Most ideas presented in the document are introduced in
the section :ref:`ingest-api-concepts` and illustrated with a simple practical example in :ref:`ingest-api-simple`.
The section is followed by a few more sections covering :ref:`ingest-api-advanced` and :ref:`ingest-api-post-ingest`.
The :ref:`ingest-api-reference` section of the document provides complete descriptions of the REST services and tools
mentioned in the document.

diff --git a/doc/ingest/api/post-ingest/index.rst b/doc/ingest/api/post-ingest/index.rst new file mode 100644 index 0000000000..c26fdd6857 --- /dev/null +++ b/doc/ingest/api/post-ingest/index.rst @@ -0,0 +1,12 @@

..
_ingest-api-post-ingest: + +================================= +Post-Ingest Data Management Tasks +================================= + +The following optional steps are performed after the data has been ingested: + +- :ref:`admin-row-counters` (ADMIN) +- :ref:`admin-data-table-index` (ADMIN) +- :ref:`admin-director-index` (ADMIN) diff --git a/doc/ingest/api/reference/index.rst b/doc/ingest/api/reference/index.rst new file mode 100644 index 0000000000..24a58e83d8 --- /dev/null +++ b/doc/ingest/api/reference/index.rst @@ -0,0 +1,12 @@ + +.. _ingest-api-reference: + +###################### +Ingest API Reference +###################### + +.. toctree:: + :maxdepth: 4 + + rest/index + tools diff --git a/doc/ingest/api/reference/rest/controller/config.rst b/doc/ingest/api/reference/rest/controller/config.rst new file mode 100644 index 0000000000..65f47786a6 --- /dev/null +++ b/doc/ingest/api/reference/rest/controller/config.rst @@ -0,0 +1,426 @@ +.. _ingest-config: + +Configuring parameters of the ingests +===================================== + +.. _ingest-config-set: + +Setting configuration parameters +-------------------------------- + +Parameters are set for a database (regardless of the *published* status) using the following service: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``PUT`` + - ``/ingest/config`` + +The request object has the following schema: + +.. code-block:: + + { "database" : , + "SSL_VERIFYHOST" : , + "SSL_VERIFYPEER" : , + "CAPATH" : , + "CAINFO" : , + "CAINFO_VAL" : , + "PROXY_SSL_VERIFYHOST" : , + "PROXY_SSL_VERIFYPEER" : , + "PROXY_CAPATH" : , + "PROXY_CAINFO" : , + "PROXY_CAINFO_VAL" : , + "CURLOPT_PROXY" : , + "CURLOPT_NOPROXY" : , + "CURLOPT_HTTPPROXYTUNNEL" : , + "CONNECTTIMEOUT" : , + "TIMEOUT" : , + "LOW_SPEED_LIMIT" : , + "LOW_SPEED_TIME" : , + "ASYNC_PROC_LIMIT" : + } + +Where: + +``database`` : *string* : **required** + The required name of a database affected by the operation. + +``SSL_VERIFYHOST`` : *number* = ``2`` + The optional flag that tells the system to verify the host of the peer. If the value is set + to ``0`` the system will not check the host name against the certificate. Any other value would tell the system + to perform the check. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYHOST.html. + +``SSL_VERIFYPEER`` : *number* = ``1`` + The optional flag that tells the system to verify the peer's certificate. If the value is set + to ``0`` the system will not check the certificate. Any other value would tell the system to perform the check. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYPEER.html. + +``CAPATH`` : *string* = ``/etc/ssl/certs`` + The optional path to a directory holding multiple CA certificates. The system will use the certificates + in the directory to verify the peer's certificate. If the value is set to an empty string the system will not use + the certificates. + + Putting the empty string as a value of the parameter will effectively turn this option off as if it has never been + configured for the database. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_CAPATH.html. + +``CAINFO`` : *string* = ``/etc/ssl/certs/ca-certificates.crt`` + The optional path to a file holding a bundle of CA certificates. The system will use the certificates + in the file to verify the peer's certificate. If the value is set to an empty string the system will not use + the certificates. 
+ + Putting the empty string as a value of the parameter will effectively turn this option off as if it has never been + configured for the database. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_CAINFO.html. + +``CAINFO_VAL`` : *string* = ``""`` + The optional value of a certificate bundle for a peer. This parameter is used in those cases when it's + impossible to inject the bundle directly into the Ingest workers' environments. If a non-empty value of the parameter + is provided then ingest servers will use it instead of the one mentioned (if any) in the above-described + attribute ``CAINFO``. + + **Attention**: Values of the attribute are the actual certificates, not file paths like in the case of ``CAINFO``. + +``PROXY_SSL_VERIFYHOST`` : *number* = ``2`` + The optional flag that tells the system to verify the host of the proxy. If the value is set + to ``0`` the system will not check the host name against the certificate. Any other value would tell the system + to perform the check. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_PROXY_SSL_VERIFYHOST.html. + +``PROXY_SSL_VERIFYPEER`` : *number* = ``1`` + The optional flag that tells the system to verify the peer's certificate. If the value is set + to ``0`` the system will not check the certificate. Any other value would tell the system to perform the check. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_PROXY_SSL_VERIFYPEER.html. + +``PROXY_CAPATH`` : *string* = ``""`` + The optional path to a directory holding multiple CA certificates. The system will use the certificates + in the directory to verify the peer's certificate. If the value is set to an empty string the system will not use + the certificates. + + Putting the empty string as a value of the parameter will effectively turn this option off as if it has never been + configured for the database. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_PROXY_CAPATH.html. + +``PROXY_CAINFO`` : *string* = ``""`` + The optional path to a file holding a bundle of CA certificates. The system will use the certificates + in the file to verify the peer's certificate. If the value is set to an empty string the system will not use + the certificates. + + Putting the empty string as a value of the parameter will effectively turn this option off as if it has never been + configured for the database. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_PROXY_CAINFO.html. + +``PROXY_CAINFO_VAL`` : *string* = ``""`` + The optional value of a certificate bundle for a proxy. This parameter is used in those cases when it's + impossible to inject the bundle directly into the Ingest workers' environments. If a non-empty value of the parameter + is provided then ingest servers will use it instead of the one mentioned (if any) in the above-described + attribute ``PROXY_CAINFO``. + + **Attention**: Values of the attribute are the actual certificates, not file paths like in the case of ``PROXY_CAINFO``. + +``CURLOPT_PROXY`` : *string* = ``""`` + Set the optional proxy to use for the upcoming request. The parameter should be a null-terminated string + holding the host name or dotted numerical IP address. A numerical IPv6 address must be written within ``[brackets]``. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_PROXY.html. 
+ +``CURLOPT_NOPROXY`` : *string* = ``""`` + The optional string consists of a comma-separated list of host names that do not require a proxy + to get reached, even if one is specified. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_NOPROXY.html. + +``CURLOPT_HTTPPROXYTUNNEL`` : *number* = ``0`` + Set the optional tunnel parameter to ``1`` to tunnel all operations through the HTTP proxy + (set with ``CURLOPT_PROXY``). + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_HTTPPROXYTUNNEL.html. + +``CONNECTTIMEOUT`` : *number* = ``0`` + The optional maximum time in seconds that the system will wait for a connection to be established. + The default value means that the system will wait indefinitely. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_CONNECTTIMEOUT.html + +``TIMEOUT`` : *number* = ``0`` + The optional maximum time in seconds that the system will wait for a response from the server. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_TIMEOUT.html + +``LOW_SPEED_LIMIT`` : *number* = ``0`` + The optional transfer speed in bytes per second that the system considers too slow and will abort the transfer. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_LOW_SPEED_LIMIT.html + +``LOW_SPEED_TIME`` : *number* = ``0`` + The optional time in seconds that the system will wait for the transfer speed to be above the limit + set by ``LOW_SPEED_LIMIT``. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_LOW_SPEED_TIME.html + +``ASYNC_PROC_LIMIT`` : *number* = ``0`` + The optional maximum concurrency limit for the number of contributions to be processed in a scope of + the database. The actual number of parallel requests may be further lowered by the hard limit specified by + the Replication System worker's configuration parameter (``worker``, ``num-async-loader-processing-threads``). + The parameter can be adjusted in real time as needed. It gets into effect immediately. Putting ``0`` as a value of + the parameter will effectively turn this option off as if it has never been configured for the database. + + This attribute directly maps to https://curl.se/libcurl/c/CURLOPT_LOW_SPEED_TIME.html + + **Note**: The parameter is available as of API version ``14``. + +If a request is successfully finished it returns the standard JSON object w/o any additional data but +the standard completion status. + +.. _ingest-config-get: + +Retrieving configuration parameters +----------------------------------- + +.. warning:: + As of version ``14`` of the API, the name of the database is required to be passed in the request's query instead of + passing it in the JSON body. The older implementation was wrong. + + +.. list-table:: + :widths: 10 25 65 + :header-rows: 1 + + * - method + - service + - query parameters + * - ``GET`` + - ``/ingest/config`` + - ``database=`` + +Where the mandatory query parameter ``database`` specifies the name of a database affected by the operation. + +If the operation is successfully finished it returns an extended JSON object that has the following schema (in addition +to the standard status and error reporting attributes): + +.. 
code-block::

    { "database" : <string>,
      "SSL_VERIFYHOST" : <number>,
      "SSL_VERIFYPEER" : <number>,
      "CAPATH" : <string>,
      "CAINFO" : <string>,
      "CAINFO_VAL" : <string>,
      "PROXY_SSL_VERIFYHOST" : <number>,
      "PROXY_SSL_VERIFYPEER" : <number>,
      "PROXY_CAPATH" : <string>,
      "PROXY_CAINFO" : <string>,
      "PROXY_CAINFO_VAL" : <string>,
      "CURLOPT_PROXY" : <string>,
      "CURLOPT_NOPROXY" : <string>,
      "CURLOPT_HTTPPROXYTUNNEL" : <number>,
      "CONNECTTIMEOUT" : <number>,
      "TIMEOUT" : <number>,
      "LOW_SPEED_LIMIT" : <number>,
      "LOW_SPEED_TIME" : <number>,
      "ASYNC_PROC_LIMIT" : <number>
    }

The attributes of the response object are the same as the ones described in the section :ref:`ingest-config-set`.

.. _ingest-config-global-workers:

Global configuration parameters of workers
------------------------------------------

.. note::
    This is the same service that was described in:

    - :ref:`ingest-db-table-management-config` (REST)

    The response object of the service also returns the information on the workers.

There are two sections related to workers in the response object. The first section ``config.general.worker``
includes the general parameters of the ingest services. Values of the parameters are the same for all
workers. The second section ``config.workers`` has the information on the individual workers.

The general information on all workers
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The schema of the relevant section of the response object is illustrated by the following example:

.. code-block:: json

    { "config": {
        "general" : {
          "worker" : {
            "num-loader-processing-threads" : 64,
            "num-http-loader-processing-threads" : 8,
            "num-async-loader-processing-threads" : 8,

            "ingest-charset-name" : "latin1",

            "ingest-max-retries" : 10,
            "ingest-num-retries" : 1,

            "loader-max-warnings" : 64,

            "async-loader-auto-resume" : 1,
            "async-loader-cleanup-on-resume" : 1
          }
        }
      }
    }

Where:

``config.general.worker`` : *object*
    A collection of the general parameters of the worker ingest service.

``num-loader-processing-threads`` : *number*
    The number of ingest request processing threads in the service that supports the proprietary
    binary protocol.

``num-http-loader-processing-threads`` : *number*
    The number of ingest request processing threads in the HTTP-based ingest service. Note that
    the service is used for processing *synchronous* contribution requests and for submitting
    the *asynchronous* requests to the service.

``num-async-loader-processing-threads`` : *number*
    The number of ingest request processing threads in a thread pool that processes
    the *asynchronous* contribution requests.

``ingest-charset-name`` : *string*
    The name of a character set for parsing the payload of the contributions.

``ingest-max-retries`` : *number*
    The maximum number of automated retries of failed contribution attempts
    in cases when such retries are still possible. The parameter represents the *hard*
    limit for the number of retries regardless of what's specified in the related
    parameter ``ingest-num-retries`` or in the contribution requests made by the workflows.
    The primary purpose of the parameter is to prevent accidental overloading
    of the ingest system should a very large number of retries be accidentally specified
    by the ingest workflows for individual contributions. Setting a value of the parameter
    to ``0`` will unconditionally disable any retries.

``ingest-num-retries`` : *number*
    The default number of automated retries of failed contribution attempts
    in cases when such retries are still possible. The limit can be changed for
    individual contributions. Note that the effective number of retries specified
    by this parameter or the one set in the contribution requests cannot
    exceed the *hard* limit set in the related parameter ``ingest-max-retries``.
    Setting a value of the parameter to ``0`` will disable automatic retries (unless they are
    explicitly enabled or requested by the ingest workflows for individual contributions).
``loader-max-warnings`` : *number*
    The maximum number of warnings to retain after executing ``LOAD DATA [LOCAL] INFILE``
    when ingesting contributions into the worker MySQL database. The warnings (if any) will be recorded in
    the persistent state of the Replication/Ingest system and returned to the ingest workflow upon request.

``async-loader-auto-resume`` : *number*
    The flag controlling the behavior of the worker's *asynchronous* ingest service after
    (deliberate or accidental) restarts. If the value of the parameter is not ``0`` then the service
    will resume processing incomplete (queued or ongoing) requests. Setting a value of the parameter
    to ``0`` will result in the unconditional failing of all incomplete contribution requests that existed
    prior to the restart.

    .. warning::

        Requests that failed in the last (loading) stage can't be resumed, and they will require aborting
        the corresponding transaction. If the automatic resume is enabled, these requests will be automatically
        closed and marked as failed.

``async-loader-cleanup-on-resume`` : *number*
    The flag controlling the behavior of the worker's *asynchronous* ingest service after
    restarting the service. If the value of the parameter is not ``0`` the service will try to clean
    up the temporary files that might be left on disk for incomplete (queued or ongoing) requests.
    The option may be disabled to allow debugging the service.

Worker-specific information
^^^^^^^^^^^^^^^^^^^^^^^^^^^

The schema of the relevant section of the response object is illustrated by the following example:

.. code-block:: json

    { "config": {
        "workers" : [
          { "name" : "db02",
            "is-enabled" : 1,
            "is-read-only" : 0,

            "loader-host" : {
              "addr" : "172.24.49.52",
              "name" : "sdfqserv002.sdf.slac.stanford.edu"
            },
            "loader-port" : 25002,
            "loader-tmp-dir" : "/qserv/data/ingest",

            "http-loader-host" : {
              "name" : "sdfqserv002.sdf.slac.stanford.edu",
              "addr" : "172.24.49.52"
            },
            "http-loader-port" : 25004,
            "http-loader-tmp-dir" : "/qserv/data/ingest"
          }
        ]
      }
    }

Where:

``config.workers`` : *array*
    A collection of worker nodes, where each object represents a worker node.

``name`` : *string*
    The unique identifier of a worker node.

``is-enabled`` : *number*
    The flag that tells if the worker node is enabled. If the value is set to ``0`` the worker node is disabled.
    Workers which are not enabled do not participate in the ingest activities.

``is-read-only`` : *number*
    The flag that tells if the worker node is read-only. If the value is set to ``0`` the worker node is read-write.
    Workers which are in the read-only state do not participate in the ingest activities.

**Parameters of the ingest service that supports the proprietary binary protocol**:

``loader-host`` : *object*
    The object with the information about the loader host.

    - ``addr`` : *string*
        The IP address of the loader host.

    - ``name`` : *string*
        The FQDN (fully-qualified domain name) of the host.

``loader-port`` : *number*
    The port number of the ingest service.
+ +``loader-tmp-dir`` : *string* + The path to the temporary directory on the loader host that is used by the ingest service + as a staging area for the contributions. + +**Parameters of the HTTP-based ingest service**: + +``http-loader-host`` : *object* + The object with the information about the loader host. + + - ``addr`` : *string* + The IP address of the lder host. + + - ``name`` : *string* + The FQDN (fully-qualified domain name) of the host. + +``http-loader-port`` : *number* + The port number of the ingest service. + +``http-loader-tmp-dir`` : *string* + The path to the temporary directory on the loader host that is used by the ingest service + as a staging area for the contributions. diff --git a/doc/ingest/api/reference/rest/controller/db-table-management.rst b/doc/ingest/api/reference/rest/controller/db-table-management.rst new file mode 100644 index 0000000000..f7624f8eb8 --- /dev/null +++ b/doc/ingest/api/reference/rest/controller/db-table-management.rst @@ -0,0 +1,559 @@ +Database and table management +============================= + +.. _ingest-db-table-management-config: + +Finding existing databases and database families +------------------------------------------------ + +The following service pulls all configuration information of of the Replication/Ingest System, including info +on the known database families, databases and tables: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``GET`` + - ``/replication/config`` + +Upon successful (see :ref:`ingest-general-error-reporting`) completion of the request, the service will return an object +that has the following schema (of which only the database and database family-related fields are shown): + +.. code-block:: json + + { + "config": { + "database_families" : [ + { + "overlap" : 0.01667, + "min_replication_level" : 3, + "num_sub_stripes" : 3, + "name" : "production", + "num_stripes" : 340 + } + ], + "databases" : [ + { + "database" : "dp01_dc2_catalogs_02", + "create_time" : 0, + "is_published" : 1, + "publish_time" : 1662688661000, + "family_name" : "production", + "tables" : [ + { + "ang_sep" : 0, + "is_director" : 1, + "latitude_key" : "coord_dec", + "create_time" : 1662774817703, + "unique_primary_key" : 1, + "flag" : "", + "name" : "Source", + "director_database_name" : "", + "is_ref_match" : 0, + "is_partitioned" : 1, + "longitude_key" : "coord_ra", + "database" : "dp02_dc2_catalogs", + "director_table" : "", + "director_key2" : "", + "director_database_name2" : "", + "director_key" : "sourceId", + "director_table2" : "", + "director_table_name2" : "", + "is_published" : 1, + "director_table_name" : "", + "publish_time" : 1663033002753, + "columns" : [ + { + "name" : "qserv_trans_id", + "type" : "INT NOT NULL" + }, + { + "type" : "BIGINT NOT NULL", + "name" : "sourceId" + }, + { + "type" : "DOUBLE NOT NULL", + "name" : "coord_ra" + }, + { + "type" : "DOUBLE NOT NULL", + "name" : "coord_dec" + } + ] + } + ] + } + ] + } + } + +**Notes**: + +- The sample object was truncated for brevity. The actual number of families, databases, tables and columns were + much higher in the real response. +- The number of attributes varies depending on a particular table type. The example above shows + attributes for the table ``Source``. This table is *partitioned* and is a *director* (all *director*-type tables + are partitioned in Qserv). + + +.. 
_ingest-db-table-management-register-db:

Registering databases
---------------------

Each database has to be registered in Qserv before one can create tables and ingest data. The following
service of the Replication Controller allows registering a database:

.. list-table::
   :widths: 10 90
   :header-rows: 0

   * - ``POST``
     - ``/ingest/database``

The service requires a JSON object of the following schema:

.. code-block::

    {
      "database" : <string>,
      "num_stripes" : <number>,
      "num_sub_stripes" : <number>,
      "overlap" : <number>,
      "auto_build_secondary_index" : <number>,
      "local_load_secondary_index" : <number>
    }

Where:

``database`` : *string*
    The required name of the database to be created.

``num_stripes`` : *number*
    The required number of stripes that was used when partitioning data of all tables to be ingested in a scope of the database.

``num_sub_stripes`` : *number*
    The required number of sub-stripes that was used when partitioning data of all tables to be ingested in a scope of the database.

``overlap`` : *number*
    The required overlap between the stripes.

``auto_build_secondary_index`` : *number* = ``1``
    The flag that specifies the desired mode for building the *director* (used to be known as the *secondary*)
    indexes of the director tables of the catalog. The flag controls the automatic building of the indexes, where:

    - ``1``: Build the index automatically during transaction commit time.
    - ``0``: Do not build the index automatically during transaction commit time. In this case, it will be up to a workflow
      to trigger the index building as a separate "post-ingest" action using the corresponding service:

      - :ref:`ingest-director-index-build`

    **Note**: Catalogs in Qserv may have more than one director table. This option applies to all such tables.

.. warning::

    - The service will return an error if a database with the same name already exists in the system.
    - Values of the attributes ``num_stripes``, ``num_sub_stripes`` and ``overlap`` are expected to match
      the corresponding partitioning parameters used when partitioning all partitioned tables of the new database.
      Note that the current implementation of the Qserv Ingest system will not validate contributions to the partitioned
      tables to enforce this requirement. Only the structural correctness will be checked. It's up to a workflow
      to ensure the data ingested into tables are correct.
    - Building the *director* index during transaction commit time (for the relevant tables) may have a significant
      impact on the performance of the transaction commit operation. The impact is proportional to the size of the
      contributions made into the table during the transaction. This may prolong the transaction commit time.
      An alternative option is to build the indexes as a separate "post-ingest" action using the corresponding service:

      - :ref:`ingest-director-index-build`

If the operation is successfully finished (see :ref:`ingest-general-error-reporting`) a JSON object returned by the service
will have the following attribute:

.. code-block::

    {
      "database": {
        ...
      }
    }

The object containing the database configuration information has the same schema as it was explained earlier in section:

- :ref:`ingest-db-table-management-config`
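For illustration, here is a minimal Python sketch of registering a database using the service described above.
It assumes the ``requests`` package, a hypothetical Master Replication Controller URL, and a made-up database
name ``test101``; any authentication attributes required by a particular deployment are omitted. The partitioning
parameters reuse the values shown in the sample database family earlier in this document.

.. code-block:: python

    import requests

    # Hypothetical base URL of the Master Replication Controller's REST server.
    CONTROLLER_URL = "http://qserv-controller.example.org:25081"

    # Partitioning parameters must match the ones used when partitioning the input data.
    request = {
        "database": "test101",
        "num_stripes": 340,
        "num_sub_stripes": 3,
        "overlap": 0.01667,
        # Build the director indexes later as a separate "post-ingest" action.
        "auto_build_secondary_index": 0,
    }

    response = requests.post(f"{CONTROLLER_URL}/ingest/database", json=request)
    response.raise_for_status()

    # The response carries the updated database configuration, as described above.
    database_config = response.json()["database"]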
.. _ingest-db-table-management-register-table:

Registering tables
------------------

All tables, regardless of whether they are *partitioned* or *regular* (fully replicated on all worker nodes), have to be registered
using the following Replication Controller's service:

.. list-table::
   :widths: 10 90
   :header-rows: 0

   * - ``POST``
     - ``/ingest/table``

The JSON object sent to the service with the request shall describe the table to be registered. The schema of the object for
the **partitioned** tables is presented below:

.. code-block::

    {
      "database" : <string>,
      "table" : <string>,
      "is_partitioned" : <number>,
      "schema" : [
        { "name" : <string>,
          "type" : <string>
        },
        ...
      ],
      "director_table" : <string>,
      "director_key" : <string>,
      "director_table2" : <string>,
      "director_key2" : <string>,
      "latitude_key" : <string>,
      "longitude_key" : <string>,
      "flag" : <string>,
      "ang_sep" : <double>,
      "unique_primary_key" : <number>
    }

A description of a *regular* table has fewer attributes (the attributes which are specific to the *partitioned*
tables are missing):

.. code-block::

    {
      "database" : <string>,
      "table" : <string>,
      "is_partitioned" : <number>,
      "schema": [
        {
          "name" : <string>,
          "type" : <string>
        },
        ...
      ]
    }

Where the attributes are:

``database`` : *string*
    The required name of the existing database.

``table`` : *string*
    The required name of a table to be created.

``is_partitioned`` : *number*
    The required type of table. Allowed values:

    - ``1`` for partitioned tables (including any subtypes)
    - ``0`` for the regular tables.

``schema`` : *array*
    The required definition of the table schema, where each entry of the array is an object with the following attributes:

    - ``name``: The name of the column.
    - ``type``: The type of the column. The type must adhere to the MySQL requirements for column types.

``director_table`` : *string*
    The name of the corresponding first (or left) *director* table. The name is required to be non-empty for
    the *dependent* tables and it has to be empty for the *director* tables. This is the only way to differentiate between
    the two types of *partitioned* tables.

    **Note**: The *ref-match* tables are considered *dependent* tables since they have columns that point
    to the corresponding *director* tables. See attributes: ``director_key``, ``director_table2``, and ``director_key2``.

``director_key`` : *string*
    The required name of a column in a *partitioned* table. The role of the column depends on the subtype of
    the table:

    - *director*: the primary key of the table
    - *dependent*: the foreign key pointing to the corresponding column of the *director* table

``director_table2`` : *string*
    The name of the corresponding second (or right) *director* table. A non-empty value
    is required for the *ref-match* tables and it has to be empty for the *director* and *dependent* tables.

    **Note**: The very presence of this attribute in the input configuration would imply an intent to register
    a *ref-match* table. In this case, non-empty values of the attributes ``director_key2``, ``flag`` and ``ang_sep``
    will be required in order to succeed with the registration.

``director_key2`` : *string*
    The name of a column that is associated (AKA *foreign key*) with the corresponding column of the second *director* table.
    A value of this attribute is required and must not be empty when registering the *ref-match* tables.
    It will be ignored for other table types.
See a description of the attribute ``director_table2``.

``latitude_key`` : *string*
    The name of a column in a *director* table that represents latitude. It's required for the *director* tables and
    optional for the *dependent* tables.

``longitude_key`` : *string*
    The name of a column in a *director* table that represents longitude. It's required for the *director* tables and
    optional for the *dependent* tables.

``flag`` : *string*
    The name of the special column that is required to be present in the *ref-match* tables.
    Values of the column are populated by the tool ``sph-partition-matches`` when partitioning the input files
    of the *ref-match* tables. The data type of this column is usually:

    .. code-block:: sql

        INT UNSIGNED

``ang_sep`` : *double*
    The value of the angular separation for the matched objects that is used by Qserv to process queries which
    involve the *ref-match* tables. The value is in radians. The value is required to be non-zero for the *ref-match* tables.

``unique_primary_key`` : *number* = ``0``
    The optional flag that allows dropping the uniqueness requirement for the *director* keys of the table. The parameter
    is meant to be used for testing new table products, or for the *director* tables that won't have any dependents (child tables).
    Allowed values:

    - ``0``: The primary key is not unique.
    - ``1``: The primary key is unique.

.. warning::

    - The table schema does not include definitions of indexes. Those are managed separately after the table is published.
      The index management interface is documented in a dedicated document.

      - **TODO**: Managing indexes of MySQL tables at Qserv workers.

    - The service will return an error if a table with the same name already exists in the system, or
      if the database did not exist at the time when the request was delivered to the service.

    - The service will return an error if the table schema is not correct. The schema will be checked for correctness.

.. note:: Requirements for the table schema:

    - The variable-length columns are not allowed in Qserv for the *director* and *ref-match* tables. All columns of these
      tables must have fixed lengths. These are the variable-length types: ``VARCHAR``, ``VARBINARY``, ``BLOB``, ``TEXT``,
      ``GEOMETRY`` and ``JSON``.

    - The *partitioned* tables are required to have the parameters ``director_key``, ``latitude_key`` and ``longitude_key``.
    - The *director* tables are required to have non-empty column names in the parameters ``director_key``, ``latitude_key`` and ``longitude_key``.
    - The *dependent* tables are required to have a non-empty column name specified in the parameter ``director_key``.
    - The *dependent* tables are allowed to have empty values in the parameters ``latitude_key`` and ``longitude_key``.

    - For tables where the attributes ``latitude_key`` and ``longitude_key`` are provided (either because they are required
      or because they are optional), the values must be either both non-empty or both empty. An attempt to specify only one of the
      attributes, or to provide a non-empty value in one attribute while leaving the other one empty, will result in an error.

    - All columns mentioned in the attributes ``director_key``, ``director_key2``, ``flag``, ``latitude_key`` and ``longitude_key``
      must be present in the table schema.

    - Do not use quotes around the names or type specifications.

    - Do not start the column names with the reserved prefix ``qserv``. This prefix is reserved for the Qserv-specific columns.
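To make the above requirements more concrete, here is a hedged Python sketch of registering a *director* table
with the service described in this section. It assumes the ``requests`` package, a hypothetical Master Replication
Controller URL, and made-up database and table names; authentication attributes required by a particular deployment
are omitted. The document's own schema example for the ``Source`` table follows right after this sketch.

.. code-block:: python

    import requests

    # Hypothetical base URL of the Master Replication Controller's REST server.
    CONTROLLER_URL = "http://qserv-controller.example.org:25081"

    # A director table: "director_table" is left empty, and the spatial columns are provided.
    request = {
        "database": "test101",
        "table": "Object",
        "is_partitioned": 1,
        "director_table": "",
        "director_key": "objectId",
        "latitude_key": "coord_dec",
        "longitude_key": "coord_ra",
        "schema": [
            {"name": "objectId",  "type": "BIGINT NOT NULL"},
            {"name": "coord_ra",  "type": "DOUBLE NOT NULL"},
            {"name": "coord_dec", "type": "DOUBLE NOT NULL"},
        ],
    }

    response = requests.post(f"{CONTROLLER_URL}/ingest/table", json=request)
    response.raise_for_status()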
An example of the schema definition for the table ``Source``:

.. code-block:: json

    [
      {
        "name" : "sourceId",
        "type" : "BIGINT NOT NULL"
      },
      {
        "name" : "coord_ra",
        "type" : "DOUBLE NOT NULL"
      },
      {
        "name" : "coord_dec",
        "type" : "DOUBLE NOT NULL"
      }
    ]

If the operation is successfully finished (see :ref:`ingest-general-error-reporting`) a JSON object returned by the service
will have the following attribute:

.. code-block::

    {
      "database": {
        ...
      }
    }

The object will contain the updated database configuration information that will also include the new table.
The object will have the same schema as it was explained earlier in section:

- :ref:`ingest-db-table-management-config`

**Notes on the table names**:

- Generally, the names of the tables must adhere to the MySQL requirements for identifiers
  as explained in:

  - https://dev.mysql.com/doc/refman/8.0/en/identifier-qualifiers.html

- The names of identifiers (including tables) in Qserv are case-insensitive. This is not the general requirement
  in MySQL, where the case sensitivity of identifiers is configurable one way or another. This requirement
  is enforced by the configuration of MySQL in Qserv.

- The length of the name should not exceed 64 characters as per:

  - https://dev.mysql.com/doc/refman/8.0/en/identifier-length.html

- The names should **not** start with the prefix ``qserv``. This prefix is reserved for the Qserv-specific tables.


.. _ingest-db-table-management-publish-db:

Publishing databases
--------------------

Databases are published (made visible to Qserv users) by calling this service:

.. list-table::
   :widths: 10 90
   :header-rows: 0

   * - ``PUT``
     - ``/ingest/database/:database``

The name of the database is provided as a parameter ``database`` of the resource path. There are a few optional
parameters to be sent in the JSON body of the request:

.. code-block::

    {
      "consolidate_secondary_index" : <number>,
      "row_counters_deploy_at_qserv" : <number>
    }

Where:

``consolidate_secondary_index`` : *number* = ``0``
    The optional parameter that controls the final format of all the *director* index tables of the database.
    Normally, the *director* indexes are MySQL-partitioned tables. If the value of this optional parameter is
    not ``0`` then the Ingest System will consolidate the MySQL partitions and turn the tables into the monolithic form.

    .. warning::

        Depending on the scale of the catalog (the sizes of the affected tables), this operation may be quite lengthy (up to many hours).
        Besides, based on the experience to date with the MySQL-partitioned director indexes, the impact of the partitions
        on the index's performance is rather negligible. So, it's safe to ignore this option in all but very special cases that are not
        discussed by the document.

        One can find more info on the MySQL partitioning at:

        - https://dev.mysql.com/doc/refman/8.0/en/partitioning.html

``row_counters_deploy_at_qserv`` : *number* = ``0``
    The optional flag that triggers scanning and deploying the row counters as explained at:

    - :ref:`admin-row-counters` (ADMIN)
    - :ref:`ingest-row-counters-deploy` (REST)

    To trigger this operation the ingest workflow should provide a value that is not ``0``. In this case the row counters
    collection service will be invoked with the following combination of parameters:
.. list-table::
   :widths: 50 50
   :header-rows: 1

   * - attr
     - value
   * - ``overlap_selector``
     - ``CHUNK_AND_OVERLAP``
   * - ``force_rescan``
     - ``1``
   * - ``row_counters_state_update_policy``
     - ``ENABLED``
   * - ``row_counters_deploy_at_qserv``
     - ``1``

.. warning::

    The row counters deployment is a very resource-consuming operation. It may take a long time to complete
    depending on the size of the catalog. This will also delay the catalog publishing stage of an ingest campaign.
    A better approach is to deploy the row counters as a "post-ingest" operation as explained in:

    - (**TODO** link) Deploying row counters as a post-ingest operation

.. note::

    The catalogs may also be unpublished to add more tables. The relevant REST service is documented in:

    - (**TODO** link) Un-publishing databases to allow adding more tables


.. _ingest-db-table-management-unpublish-db:

Un-publishing databases to allow adding more tables
---------------------------------------------------

Unpublished databases as well as previously ingested tables will still be visible to users of Qserv.
The main purpose of this operation is to allow adding new tables to the existing catalogs.
The new tables won't be seen by users until the catalog is published back using the following REST service:

- :ref:`ingest-db-table-management-publish-db`

Databases are un-published by calling this service:

.. list-table::
   :widths: 10 90
   :header-rows: 0

   * - ``PUT``
     - ``/replication/config/database/:database``

The name of the database is provided in a parameter ``database`` of the resource. The only mandatory parameter
to be sent in the JSON body of the request is:

``admin_auth_key`` : *string*
    The administrator-level authentication key that is required to un-publish the database.
    The key is used to prevent unauthorized access to the service.

    **Note**: The key is different from the one used to publish the database. The elevated privileges
    are needed to reduce the risks of disrupting user access to the previously loaded and published databases.


.. _ingest-db-table-management-delete:

Deleting databases and tables
-----------------------------

These services can be used for deleting non-*published* databases (the ones that are still being ingested) as well as
*published* databases, or tables, including deleting all relevant persistent structures from Qserv:

.. list-table::
   :widths: 10 90
   :header-rows: 0

   * - ``DELETE``
     - | ``/ingest/database/:database``
       | ``/ingest/table/:database/:table``

To delete a non-*published* database (or a table from such a database) a client has to provide the normal-level authentication
key ``auth_key`` in a request to the service:

.. code-block::

    { "auth_key" : <string>
    }

The name of the database affected by the operation is specified in the resource's path.

Deleting databases (or tables from those databases) that have already been published requires a user to have
elevated administrator-level privileges. These privileges are associated with the authentication key ``admin_auth_key``
to be sent with a request instead of ``auth_key``:

.. code-block::

    { "admin_auth_key" : <string>
    }

Upon successful completion of the request (for both above-mentioned states of the database), the service will return the standard
response as explained in the section mentioned below. After that, the database (or the table, depending on the scope of the request)
name can be reused for further ingests if needed.

- :ref:`ingest-general-error-reporting`
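As an illustration of the deletion service above, the following hedged Python sketch removes a table from a catalog
that has not been published yet. It assumes the ``requests`` package, a hypothetical Master Replication Controller URL,
and made-up database and table names; the value of the authentication key is a placeholder.

.. code-block:: python

    import requests

    # Hypothetical base URL of the Master Replication Controller's REST server.
    CONTROLLER_URL = "http://qserv-controller.example.org:25081"

    # The normal-level authentication key of the ingest workflow (placeholder value).
    AUTH_KEY = "CHANGE-ME"

    # Delete the table "Object" from the non-published database "test101".
    response = requests.delete(
        f"{CONTROLLER_URL}/ingest/table/test101/Object",
        json={"auth_key": AUTH_KEY},
    )
    response.raise_for_status()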
diff --git a/doc/ingest/api/reference/rest/controller/director-index.rst b/doc/ingest/api/reference/rest/controller/director-index.rst new file mode 100644 index 0000000000..33595bdefd --- /dev/null +++ b/doc/ingest/api/reference/rest/controller/director-index.rst @@ -0,0 +1,139 @@

Director Index Management
=========================

.. _ingest-director-index-build:

(Re-)building the Index
-----------------------

.. note:: API version notes:

    - As of version ``21``, the service can no longer be used to (re-)build indexes of all *director*
      tables of a catalog. It's now required to provide the name of the affected table in the parameter ``director_table``.

    - As of version ``22``, the service no longer supports the option ``allow_for_published``. Any attempts to specify
      the option will result in a warning reported by the service back to a client. The service will ignore the option.

.. warning::
    Be advised that the amount of time needed to build an index of a large-scale catalog may be quite long.
    The current implementation of the secondary index is based on MySQL's InnoDB table engine. The insert
    time into this B-Tree table has logarithmic performance. It may take many hours to build the index for catalogs
    of billions of objects. In some earlier tests, the build time was 20 hours for a catalog of 20 billion objects.


The service of the **Master Replication Controller** builds or rebuilds (if needed) the *director* (used to be known as
the *secondary*) index table of a database. The target table must be *published* at the time of this operation.

.. list-table::
   :widths: 10 90
   :header-rows: 0

   * - ``POST``
     - ``/ingest/index/secondary``

The request object has the following schema:

.. code-block::

    { "database" : <string>,
      "director_table" : <string>,
      "rebuild" : <number>,
      "local" : <number>
    }

Where:

``database`` : *string*
    The required name of a database affected by the operation.

``director_table`` : *string*
    The required name of the *director* table for which the index is required to be (re-)built.

``rebuild`` : *number* = ``0``
    The optional flag that allows recreating an existing index. If the value is set to ``0`` the service
    will refuse to proceed with the request if the index already exists. Any other value would tell the service
    to drop (if it exists) the index table before re-creating and re-populating it with entries.

``local`` : *number* = ``0``
    The optional flag that tells the service how to ingest data into the index table, where:

    - ``0``: Index contributions are required to be directly placed by the Replication/Ingest System at a location
      that is directly accessible by the MySQL server hosting the index table. This could be either some local folder
      of a host where the service is being run or a folder located at a network filesystem mounted on the host.
      Once a file is in place, it would be ingested into the destination table using this protocol:

      .. code-block:: sql

          LOAD DATA INFILE ...

      **Note**: Be aware that this option may not always be possible (or may cause complications) in Kubernetes-based
      deployments of Qserv.

    - ``1`` (or any other numeric value): Index contributions would be ingested into the table using this protocol:

      .. code-block:: sql

          LOAD DATA LOCAL INFILE ...

      **Note**: Files would first be copied by MySQL into a temporary folder owned by the MySQL service before being
      ingested into the table. This option has the following caveats:

      - The protocol must be enabled in the MySQL server configuration by setting a system variable: ``local_infile=1``.
      - The temporary folder of the MySQL server is required to have sufficient space to temporarily accommodate index
        contribution files before they'd be loaded into the table. In the worst-case scenario, there should be enough
        space to accommodate all contributions of a given catalog. One could make a reasonable estimate for the latter
        by knowing the total number of rows in the director table of the catalog, the size of the primary
        key (typically the ``objectId`` column) of the table, as well as the types of the ``chunk`` and ``subChunk``
        columns (which are usually the 32-bit integer numbers in Qserv).
      - This ingest option would also affect (lower) the overall performance of the operation due to additional
        data transfers required for copying file contributions from a location managed by the **Master Replication Controller**
        to the temporary folder of the MySQL server.
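For illustration, here is a hedged Python sketch that requests a rebuild of the director index for a published table,
using the service and attributes described above. It assumes the ``requests`` package, a hypothetical Master Replication
Controller URL, and made-up database and table names; authentication attributes required by a particular deployment are omitted.

.. code-block:: python

    import requests

    # Hypothetical base URL of the Master Replication Controller's REST server.
    CONTROLLER_URL = "http://qserv-controller.example.org:25081"

    # Rebuild the index of the director table "Object" of the database "test101",
    # letting workers load contributions via LOAD DATA LOCAL INFILE (local=1).
    request = {
        "database": "test101",
        "director_table": "Object",
        "rebuild": 1,
        "local": 1,
    }

    response = requests.post(f"{CONTROLLER_URL}/ingest/index/secondary", json=request)
    response.raise_for_status()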
If the operation succeeded, the service will respond with the default JSON object which will not carry any additional
attributes on top of what's mandated in :ref:`ingest-general-error-reporting`.

If errors were encountered during an actual attempt to build the index, the object may have a non-trivial
value of the ``error_ext`` attribute. The object will carry specific reasons for the failures. The schema of the object
is presented below:

.. code-block::

    "error_ext" : {
      <table> : {
        <worker> : {
          <chunk> : <error>,
          ...
        },
        ...
      },
      ...
    }

Where:

``table`` : *string*
    The placeholder for the name of the director table.

``worker`` : *string*
    The placeholder for the name of the worker service that failed to build the index.

``chunk`` : *number*
    The placeholder for the chunk number.

``error`` : *string*
    The placeholder for the error message.

Here is an example of how this object might look:

.. code-block::

    "error_ext" : {
      "object" : {
        "qserv-db01" : {
          122 : "Failed to connect to the worker service",
          3456 : "error: Table 'tes96__Object' already exists, errno: 1050"
        },
        "qserv-db23" : {
          123 : "Failed to connect to the worker service"
        }
      }
    }

diff --git a/doc/ingest/api/reference/rest/controller/index.rst b/doc/ingest/api/reference/rest/controller/index.rst new file mode 100644 index 0000000000..427d79ace2 --- /dev/null +++ b/doc/ingest/api/reference/rest/controller/index.rst @@ -0,0 +1,14 @@

#############################
Master Replication Controller
#############################

.. toctree::
   :maxdepth: 4

   config
   db-table-management
   trans-management
   table-location
   info
   director-index
   row-counters

diff --git a/doc/ingest/api/reference/rest/controller/info.rst b/doc/ingest/api/reference/rest/controller/info.rst new file mode 100644 index 0000000000..7194a567fa --- /dev/null +++ b/doc/ingest/api/reference/rest/controller/info.rst @@ -0,0 +1,137 @@

Information services
====================

.. _ingest-info-chunks:

Chunk disposition
-----------------

.. warning::
    Do not use this service for the chunk placement decisions during catalog ingestion. The service is for
    informational purposes only.

The service of the **Master Replication Controller** returns information about the chunk *replicas* in a scope of a given database:

.. list-table::
   :widths: 10 15 75
   :header-rows: 1

   * - method
     - service
     - query parameters
   * - ``GET``
     - ``/ingest/chunks``
     - ``database=<name>``

Where:

``database`` : *string*
    The required name of a database affected by the operation.

The resulting object has the following schema:

.. code-block::

    {
      "replica": [
        {
          "chunk" : <number>,
          "worker": <string>,
          "table" : {
            <table> : {
              "overlap_rows" : <number>,
              "overlap_data_size" : <number>,
              "overlap_index_size" : <number>,
              "rows" : <number>,
              "data_size" : <number>,
              "index_size" : <number>
            },
            ...
          }
        },
        ...
      ]
    }

Where:

``replica`` : *array*
    A collection of chunk **replicas**, where each object represents a chunk replica. Replicas of a chunk
    are essentially the same chunk, but placed on different workers.

``chunk`` : *number*
    The chunk number.

``worker`` : *string*
    The unique identifier of a worker where the chunk replica is located.

``table`` : *object*
    The object with the information about the chunk replica in the scope of
    a particular *partitioned* table.

    **Attention**: The current implementation is incomplete. It will return ``0`` for all attributes
    of the table object.

``overlap_rows`` : *number*
    The number of rows in the chunk's overlap table.

``overlap_data_size`` : *number*
    The number of bytes in the chunk's overlap table (measured by the size of the corresponding file).

``overlap_index_size`` : *number*
    The number of bytes in the index of the chunk's overlap table (measured by the size
    of the corresponding file).

``rows`` : *number*
    The number of rows in the chunk table.
+ +``data_size`` : *number* + The number of bytes in the chunk table (measured by the size of the corresponding file). + +``index_size`` : *number* + The number of bytes in the index of the chunk table (measured by the size of + the corresponding file). + +.. _ingest-info-contrib-requests: + +Status of the contribution request +---------------------------------- + +The service of the **Master Replication Controller** returns information on a contribution request: + +.. list-table:: + :widths: 10 15 75 + :header-rows: 1 + + * - method + - service + - query parameters + * - ``GET`` + - ``/ingest/trans/contrib/:id`` + - | ``include_warnings=<0|1>`` + | ``include_retries=<0|1>`` + +Where: + +``id`` : *number* + The required unique identifier of the contribution request that was submitted + to a Worker Ingest service earlier. + +``include_warnings`` : *number* = ``0`` + The optional flag telling the service to include warnings into the response. Any value + that is not ``0`` is considered as ``1``, meaning that the warnings should be included. + +``include_retries`` : *number* = ``0`` + The optional flag telling the service to include retries into the response. Any value + that is not ``0`` is considered as ``1``, meaning that the retries should be included. + +The resulting object has the following schema: + +.. code-block:: + + { "contribution" : + } + +Where the detailed description on the enclosed contribution object is provided in the section: + +- :ref:`ingest-trans-management-descriptor-contrib-long` diff --git a/doc/ingest/api/reference/rest/controller/row-counters.rst b/doc/ingest/api/reference/rest/controller/row-counters.rst new file mode 100644 index 0000000000..0e2197a3ee --- /dev/null +++ b/doc/ingest/api/reference/rest/controller/row-counters.rst @@ -0,0 +1,193 @@ + +Row counters +============ + +.. _ingest-row-counters-deploy: + +Collecting row counters and deploying them at Qserv +--------------------------------------------------- + +The service collects row counters in the specified table and (optionally) deploys the counters +in Qserv to allow optimizations of the relevant queries. The database may or may not be in +the published state at the time of this operation. + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``POST`` + - ``/ingest/table-stats`` + +Where the request object has the following schema: + +.. code-block:: + + { "database" : , + "table" : , + "overlap_selector" : , + "force_rescan" : , + "row_counters_state_update_policy" : , + "row_counters_deploy_at_qserv" : + } + +Where: + +``database`` : *string* + The required name of a database affected by the operation. + +``table`` : *string* + The required name of the table for which the row counters are required to be collected. + +``overlap_selector`` : *string* = ``CHUNK_AND_OVERLAP`` + The optional selector for a flavor of the table for which the counters will be collected. + Possible options are: + + - ``CHUNK_AND_OVERLAP``: Both the chunk table itself and the overlap table. + - ``CHUNK``: Only the chunk table. + - ``OVERLAP``: Only the overlap table. + + **Note**: This parameter applies to the *partitioned* tables only. It's ignored for the *regular* (fully replicated) + tables. + +``force_rescan`` : *number* = ``0`` + The optional flag that tells the service to rescan the counters that were recorded earlier. + If the value is set to ``0`` the service will not rescan the counters if the previous version already exists. 
+  If the value is set to ``1`` (or any other number) the service will rescan the counters regardless of
+  the previous version.
+
+``row_counters_state_update_policy`` : *string* = ``DISABLED``
+  The optional parameter that drives the counters update policy within the persistent
+  state of the Replication/Ingest system. These are the possible options:
+
+  - ``DISABLED``: The service will collect the counters but it will not update the persistent state.
+  - ``ENABLED``: Update the counters in the system if the scan was successful and if no counters were
+    recorded earlier.
+  - ``FORCED``: Same as ``ENABLED`` except it allows overriding the previous state of the counters.
+
+``row_counters_deploy_at_qserv`` : *number* = ``0``
+  The optional flag tells the service whether the counters should be deployed at Qserv.
+  If the value is set to ``0`` the service will not deploy the counters. Any other value would tell
+  the service to drop the previous version of the counters (if any existed) in Qserv and update the counters.
+
+.. _ingest-row-counters-delete:
+
+Deleting row counters for a table
+----------------------------------
+
+The service removes the previously collected row counters of the specified table from Qserv and (optionally if requested)
+from the Replication system's persistent state. The database may or may not be published at the time of this operation:
+
+.. list-table::
+   :widths: 10 90
+   :header-rows: 0
+
+   * - ``DELETE``
+     - ``/ingest/table-stats/:database/:table``
+
+Where the service path has the following parameters:
+
+``database`` : *string*
+  The name of a database affected by the operation.
+
+``table`` : *string*
+  The name of the table whose row counters are to be deleted.
+
+The request object sent in the JSON body has the following schema:
+
+.. code-block::
+
+   { "overlap_selector" : <string>,
+     "qserv_only" : <number>
+   }
+
+Where:
+
+``overlap_selector`` : *string* = ``CHUNK_AND_OVERLAP``
+  The optional selector for a flavor of the table for which the counters will be removed.
+  Possible options are:
+
+  - ``CHUNK_AND_OVERLAP``: Both the chunk table itself and the overlap table.
+  - ``CHUNK``: Only the chunk table.
+  - ``OVERLAP``: Only the overlap table.
+
+  **Note**: This parameter applies to the *partitioned* tables only. It's ignored for the *regular* (fully replicated)
+  tables.
+
+``qserv_only`` : *number* = ``0``
+  The optional flag tells the service whether the counters should be removed from Qserv only,
+  or from both Qserv and the Replication system's persistent state:
+
+  - ``0``: Remove the counters from both Qserv and the Replication system's persistent state.
+  - ``1`` (or any other number which is not ``0``): Remove the counters only from Qserv.
+
+.. _ingest-row-counters-inspect:
+
+Inspecting row counters of a table
+-----------------------------------
+
+The service returns the status of the previously collected (if any) row counters of the specified table from
+the Replication system's persistent state. The database may or may not be published at the time of this operation.
+
+.. list-table::
+   :widths: 10 90
+   :header-rows: 0
+
+   * - ``GET``
+     - ``/ingest/table-stats/:database/:table``
+
+Where:
+
+``database`` : *string*
+  The name of a database affected by the operation.
+
+``table`` : *string*
+  The name of the table whose row counters are to be inspected.
+
+The response returned by the service has the following JSON schema:
+
+.. 
code-block:: + + { "database" : , + "table" : , + "entries": [ + { "transaction_id" : , + "chunk" : , + "is_overlap" : , + "num_rows" : , + "update_time" : + }, + ... + ] + } + +Where: + +``database`` : *string* + The name of a database that was specified in the service resource path. + +``table`` : *string* + The name of the table was specified in the service resource path. + +``entries`` : *array* + The array of the collected row counters entries. + +``transaction_id`` : *number* + The unique identifier of a *super-transaction*. + +``chunk`` : *number* + The chunk number of the entry. + + **Note**: A value of ``0`` will be reported for the *regular* (fully-replicated) tables. + +``is_overlap`` : *number* + The flag indicates if the entry is reported for the chunk overlap (a value would differ from ``0``) + rather than for the chunk itself (a value would be ``0``). + + **Note**: The parameter should be ignored for the *regular* (fully-replicated) tables. + +``num_rows`` : *number* + The number of rows in in a scope of (``transaction_id``, ``chunk``, ``is_overlap``). + +``update_time`` : *number* + The last time the counter was collected. The time is given as the number of milliseconds since + the UNIX *Epoch* time. \ No newline at end of file diff --git a/doc/ingest/api/reference/rest/controller/table-location.rst b/doc/ingest/api/reference/rest/controller/table-location.rst new file mode 100644 index 0000000000..a80c13fa7e --- /dev/null +++ b/doc/ingest/api/reference/rest/controller/table-location.rst @@ -0,0 +1,243 @@ + +.. _table-location: + +Table location services +======================= + +.. _table-location-regular: + +Locate regular tables +--------------------- + +.. warning:: + This service was incorrectly designed by requiring the name of a database (attribute ``database``) be passed + in the ``GET`` request's body. The same problem exists for the alternative method accepting a transaction identifier + (attribute ``transaction_id``). This is not a standard practice. The ``GET`` requests are not supposed to have the body. + The body may be stripped by some HTTP clients or proxies. Both problems will be fixed in the next releases of Qserv + by moving the parameters into the query part of the URL. + +The service returns connection parameters of the Worker Data Ingest Services which are available for ingesting +the regular (fully replicated) tables: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``GET`` + - ``/ingest/regular`` + +Where the request object passed in a request's body has the following schema, in which a client would have to provide the name of a database: + +.. code-block:: + + { "database" : + } + +The database should not be published at a time when the request was being called. Otherwise the service will return an error. + +The service also supports an alternative method accepting a transaction identifier (transactions are always associated with +the corresponding databases): + +.. code-block:: + + { "transaction_id" : + } + +If the transaction identifier was provided then the transaction is required to be in the ``STARTED`` state at the time of a request. +See the section :ref:`ingest-trans-management` for more details on transactions. + +In case of successful completion the service returns the following object: + +.. code-block:: + + { "locations" : [ + { "worker" : , + "host" : , + "host_name" : , + "port" : , + "http_host" : , + "http_host_name" : , + "http_port" : + }, + ... 
+     ]
+   }
+
+Where each object in the array represents a particular worker. See an explanation of the attributes in:
+
+- :ref:`table-location-connect-params`
+
+**Note**: If the service returns an empty array then Qserv is either not properly configured,
+or it's not ready to ingest the tables.
+
+.. _table-location-chunks:
+
+Allocate/locate chunks of the partitioned tables
+------------------------------------------------
+
+The current implementation of the system offers two services for allocating (or determining locations of existing) chunks:
+
+- :ref:`table-location-chunks-one`
+- :ref:`table-location-chunks-many`
+
+Both techniques are explained in the current section. The choice of a particular technique depends on the requirements
+of a workflow. However, the second service is recommended as it's more efficient in allocating large quantities of chunks.
+
+Also note that once a chunk is assigned (allocated) to a particular worker node, all subsequent requests for the chunk are guaranteed
+to return the same name of a worker as a location of the chunk. Making multiple requests for the same chunk is safe. Chunk allocation
+requests require a valid super-transaction in the ``STARTED`` state. See the section :ref:`ingest-trans-management` for more details on transactions.
+
+.. _table-location-chunks-one:
+
+Single chunk allocation
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The following service is meant to be used for a single chunk allocation/location:
+
+.. list-table::
+   :widths: 10 90
+   :header-rows: 0
+
+   * - ``POST``
+     - ``/ingest/chunk``
+
+Where the request object has the following schema, in which a client would have to provide the name of a database:
+
+.. code-block::
+
+   { "database" : <string>,
+     "chunk" : <number>
+   }
+
+The service also supports an alternative method accepting a transaction identifier (transactions are always associated with the corresponding databases):
+
+.. code-block::
+
+   { "transaction_id" : <number>,
+     "chunk" : <number>
+   }
+
+If a request succeeded, the System would respond with the following JSON object:
+
+.. code-block::
+
+   { "locations" : [
+       { "worker" : <string>,
+         "host" : <string>,
+         "host_name" : <string>,
+         "port" : <number>,
+         "http_host" : <string>,
+         "http_host_name" : <string>,
+         "http_port" : <number>
+       },
+       ...
+     ]
+   }
+
+Where the object represents a worker to which the Ingest system requests the workflow to forward the chunk contributions.
+See an explanation of the attributes in:
+
+- :ref:`table-location-connect-params`
+
+.. _table-location-chunks-many:
+
+Multiple chunks allocation
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For allocating multiple chunks one would have to use the following service:
+
+.. list-table::
+   :widths: 10 90
+   :header-rows: 0
+
+   * - ``POST``
+     - ``/ingest/chunks``
+
+Where the request object has the following schema, in which a client would have to provide the name of a database:
+
+.. code-block::
+
+   { "database" : <string>,
+     "chunks" : [<number>, <number>, ... ]
+   }
+
+Like the above-explained case of the single chunk allocation service, this one also supports an alternative method accepting
+a transaction identifier (transactions are always associated with the corresponding databases):
+
+.. code-block::
+
+   { "transaction_id" : <number>,
+     "chunks" : [<number>, <number>, ... ]
+   }
+
+**Note** the difference in the object schema: unlike the single-chunk allocator, this one expects an array of chunk numbers.
+
+The resulting object has the following schema:
+
+.. code-block::
+
+   { "locations" : [
+       { "chunk" : <number>,
+         "worker" : <string>,
+         "host" : <string>,
+         "host_name" : <string>,
+         "port" : <number>,
+         "http_host" : <string>,
+         "http_host_name" : <string>,
+         "http_port" : <number>
+       },
+       ...
+     ]
+   }
+
+Where each object in the array represents a particular worker. See an explanation of the attributes in:
+
+- :ref:`table-location-connect-params`
+
+.. _table-location-connect-params:
+
+Connection parameters of the workers
+-------------------------------------
+
+.. warning::
+   In the current implementation of the Ingest system, values of the hostname attributes ``host_name`` and ``http_host_name`` are captured
+   by the worker services themselves. The names may not be in the FQDN format. Therefore this information has to be used with caution and
+   only in those contexts where the reported names could be reliably mapped to the external FQDN or IP addresses of the corresponding hosts
+   (or Kubernetes *pods*).
+
+Attributes of the returned object are:
+
+``chunk`` : *number*
+  The unique identifier of the chunk in Qserv.
+
+  **Note**: This attribute is reported in the chunk location/allocation services:
+
+  - :ref:`table-location-chunks`
+
+``worker`` : *string*
+  The unique identifier of the worker in Qserv.
+
+  **Note**: The worker's identifier is not the same as the worker's host name.
+
+``host`` : *string*
+  The IP address of the worker's Ingest service that supports the proprietary binary protocol.
+
+``host_name`` : *string*
+  The DNS name of the worker's Ingest service that supports the proprietary binary protocol.
+
+``port`` : *number*
+  The port number of the worker's Ingest service that supports the proprietary binary protocol. This service requires
+  the content of an input file to be sent directly to the service by a client. The Replication/Ingest system provides
+  an application :ref:`ingest-tools-qserv-replica-file` that relies on this protocol.
+
+``http_host`` : *string*
+  The IP address of the worker's Ingest service that supports the HTTP protocol.
+
+``http_host_name`` : *string*
+  The DNS name of the worker's Ingest service that supports the HTTP protocol.
+
+``http_port`` : *number*
+  The port number of the worker's Ingest service that supports the HTTP protocol. The REST server that's placed
+  in front of the service allows ingesting a single file from a variety of external sources, such as the locally
+  mounted (at the worker's host) filesystem, or a remote object store. It's also possible to push the content of a file
+  in the request body either as a JSON object or as a binary stream (``multipart/form-data``).
diff --git a/doc/ingest/api/reference/rest/controller/trans-management.rst b/doc/ingest/api/reference/rest/controller/trans-management.rst new file mode 100644 index 0000000000..193e9e084b --- /dev/null +++ b/doc/ingest/api/reference/rest/controller/trans-management.rst @@ -0,0 +1,794 @@
+.. _ingest-trans-management:
+
+Transaction management
+======================
+
+.. note::
+
+   - The transaction management services which modify a state of transactions are available only to the authorized users.
+     The authorization is based on the authentication key. The key is used to prevent unauthorized access to the services.
+
+   - The schema of the JSON object returned for each transaction is the same for all services in the group.
+     The schema is described in the section:
+
+     - :ref:`ingest-trans-management-descriptor`
+
+.. _ingest-trans-management-status:
+
+Status of a transaction
+-----------------------
+
+There are two services in this group. They are documented in the dedicated sections below.
+
+.. 
_ingest-trans-management-status-many: + +Database transactions +^^^^^^^^^^^^^^^^^^^^^ + +The service returns the information on many transactions in a scope of a database or databases selected via optional +filters passed via the request's query. The service is meant to be used by workflows for monitoring the status of +transactions and for debugging purposes. To see an actual progress of a transaction (e.g. to see the contributions +loaded into the destination table) a workflow should use the service: :ref:`ingest-trans-management-status-one`. + +.. list-table:: + :widths: 10 15 75 + :header-rows: 1 + + * - method + - service + - query parameters + * - ``GET`` + - ``/ingest/trans`` + - | ``database=`` + | ``family=`` + | ``all_databases={0|1}`` + | ``is_published={0|1}`` + | ``include_context={0|1}`` + | ``contrib={0|1}`` + | ``contrib_long={0|1}`` + | ``include_log={0|1}`` + | ``include_warnings={0|1}`` + | ``include_retries={0|1}`` + +Where: + +``database`` : *string* = ``""`` + The optional name of the database to filter the transactions by. If the parameter is present and if + it's not empty then attributes ``family``, ``all_databases`` and ``is_published`` are ignored. + +``family`` : *string* = ``""`` + The optional name of the database family. If the parameter is present and if + it's not empty then a scope of a request will be narrowed to databases - members of the given family. + Otherwise all databases regardless of their family membership will be considered. + + **Notes**: + + - The parameter is ignored if the parameter ``database`` is present. + - The final selection of the databases is also conditioned by the values of the optional parameters + ``all_databases`` and ``is_published``. See the description of the parameters for more details. + +``all_databases`` : *number* = ``0`` + The optional flag which is used for further filtering of databases selected by the parameter family. + A value of ``0`` tells the service that the parameter ``is_published`` should be used to further filter database + selection to the desired subset. Any other value would mean no additional filters (hence ignoring ``is_published``), + hence including databases selected by the parameter family. + + **Note**: The parameter is ignored if the parameter ``database`` is present. + +``is_published`` : *number* = ``0`` + The optional flag is used only if enabled by setting the previous parameter ``all_databases=0``. + A value of ``0`` tells the service to narrow the database selection to databases which are not *published*. + Any other value would select the *published* databases. + + **Note**: The parameter is ignored if the parameter ``database`` is present or when ``all_databases=1``. + +``include_context`` : *number* = ``0`` + The optional flag tells the service to include the transaction context object in the report for each transaction. + See the documentation on services :ref:`ingest-trans-management-start` or :ref:`ingest-trans-management-end` for further + details. + + .. warning:: + + Potentially, each context object could be as large as **16 MB**. Enable this option only if you really need + to see contexts for all transactions. Otherwise use an alternative (single transaction) request to pull one + transaction at a time. + +``contrib`` : *number* = ``0`` + The optional flag tells the service whether the transaction contribution objects should be included + into the report. See details on this flag in the dedicated section below. + + .. 
warning::
+
+     Even though individual contribution objects aren't large, the total number of contributions ingested
+     in a scope of each transaction (and all transactions of a database, etc.) could be quite large.
+     This would result in a significant amount of data reported by the service. In extreme cases, the response
+     object could be **1 GB** or larger. Enable this option only if you really need to see contributions
+     for selected transactions. Otherwise use an alternative (single transaction) request to pull one transaction
+     at a time: :ref:`ingest-trans-management-status-one`.
+
+``contrib_long`` : *number* = ``0``
+  This optional flag is considered only if ``contrib=1``. Setting the flag to any value other
+  than ``0`` will result in returning detailed info on the contributions. Otherwise (if a value of the parameter
+  is set to ``0``) only the summary report on contributions will be returned.
+
+``include_log`` : *number* = ``0``
+  The optional flag tells the service to include the transaction log in the report for each transaction.
+  The log is a list of events that were generated by the system in response to the transaction management
+  requests. Each entry in the log is a JSON object that includes the timestamp of the event, the event type,
+  etc. See **TODO** for the details on the log entries.
+
+``include_warnings`` : *number* = ``0``
+  The optional flag, if set to any value that differs from ``0``, tells the service to include MySQL warnings
+  captured when loading contributions into the destination table. Warnings are reported in the context of
+  contributions should they be allowed in the report.
+
+  **Note**: The parameter is ignored if ``contrib=0`` or if ``contrib_long=0``.
+
+``include_retries`` : *number* = ``0``
+  The optional flag, if set to any value that differs from ``0``, tells the service to include the information
+  on the retries to load contributions that were made during the transaction. Retries are reported in the context of
+  contributions should they be allowed in the report.
+
+  **Note**: The parameter is ignored if ``contrib=0`` or if ``contrib_long=0``.
+
+This is an example of the most typical request to the service for pulling info on all transactions of ``gaia_edr3``:
+
+.. code-block:: bash
+
+   curl -X GET "http://localhost:25081/ingest/trans?database=gaia_edr3"
+
+The service will return a JSON object with the summary report on the transactions:
+
+.. code-block:: json
+
+   {
+     "success" : 1,
+     "warning" : "No version number was provided in the request's query.",
+     "error" : "",
+     "error_ext" : {},
+     "databases" : {
+       "gaia_edr3" : {
+         "is_published" : 0,
+         "num_chunks" : 1558,
+         "transactions" : [
+           {
+             "database" : "gaia_edr3",
+             "log" : [],
+             "start_time" : 1726026383559,
+             "end_time" : 0,
+             "begin_time" : 1726026383558,
+             "id" : 1632,
+             "state" : "STARTED",
+             "transition_time" : 0,
+             "context" : {}
+           },
+           {
+             "end_time" : 1727826539501,
+             "context" : {},
+             "begin_time" : 1726026383552,
+             "log" : [],
+             "transition_time" : 1727826539218,
+             "database" : "gaia_edr3",
+             "start_time" : 1726026383553,
+             "state" : "ABORTED",
+             "id" : 1631
+           },
+           {
+             "database" : "gaia_edr3",
+             "end_time" : 1727826728260,
+             "id" : 1630,
+             "transition_time" : 1727826728259,
+             "start_time" : 1726026383547,
+             "begin_time" : 1726026383546,
+             "log" : [],
+             "state" : "FINISHED",
+             "context" : {}
+           },
+
+**Note**: The report doesn't have any entries for the contributions. 
The contributions are not included in the report since +the parameter ``contrib`` was not set to ``1``. The log entries are also missing since the parameter ``include_log`` was not set to ``1``. +Also, the transaction context objects are not included in the report since the parameter ``include_context`` was not set to ``1``. + +.. _ingest-trans-management-status-one: + +Single transaction finder +^^^^^^^^^^^^^^^^^^^^^^^^^ + +The service returns the information on a single transaction identified by its unique identifier ```` passed +via the request's query: + +.. list-table:: + :widths: 10 15 75 + :header-rows: 1 + + * - method + - service + - query parameters + * - ``GET`` + - ``/ingest/trans/`` + - | ``include_context={0|1}`` + | ``contrib={0|1}`` + | ``contrib_long={0|1}`` + | ``include_log={0|1}`` + | ``include_warnings={0|1}`` + | ``include_retries={0|1}`` + +Where the parameters are the same as for the service :ref:`ingest-trans-management-status-many`. + +This is an example of using the service for pulling info on a transaction ``1630`` and obtaining +the summary report on contributions and the transaction context: + +.. code-block:: bash + + curl -X GET "http://localhost:25881/ingest/trans/1630?contrib=1" + +The service returns a JSON object that has the following structure (the report is truncated by removing stats +on all workers but ``db12`` for brevity): + +.. code-block:: json + + { + "databases" : { + "gaia_edr3" : { + "num_chunks" : 1558, + "is_published" : 0, + "transactions" : [ + { + "id" : 1630, + "database" : "gaia_edr3", + "end_time" : 1727826728260, + "start_time" : 1726026383547, + "begin_time" : 1726026383546, + "transition_time" : 1727826728259, + "log" : [], + "context" : {}, + "state" : "FINISHED", + "contrib" : { + "summary" : { + "num_failed_retries" : 0, + "num_chunk_files" : 156, + "last_contrib_end" : 1726026945059, + "num_regular_files" : 0, + "num_rows" : 223420722, + "table" : { + "gaia_source" : { + "num_failed_retries" : 0, + "overlap" : { + "num_rows" : 6391934, + "num_warnings" : 0, + "num_rows_loaded" : 6391934, + "data_size_gb" : 5.97671127319336, + "num_files" : 155, + "num_failed_retries" : 0 + }, + "num_files" : 156, + "num_rows_loaded" : 217028788, + "num_warnings" : 0, + "data_size_gb" : 201.872497558594, + "num_rows" : 217028788 + } + }, + "num_workers" : 9, + "first_contrib_begin" : 1726026383616, + "num_rows_loaded" : 223420722, + "worker" : { + "db12" : { + "num_failed_retries" : 0, + "num_regular_files" : 0, + "num_chunk_files" : 18, + "num_rows_loaded" : 52289369, + "num_warnings" : 0, + "data_size_gb" : 48.6947402954102, + "num_chunk_overlap_files" : 23, + "num_rows" : 52289369 + }, + }, + "num_warnings" : 0, + "num_files_by_status" : { + "LOAD_FAILED" : 0, + "IN_PROGRESS" : 0, + "CANCELLED" : 0, + "CREATE_FAILED" : 0, + "READ_FAILED" : 0, + "FINISHED" : 311, + "START_FAILED" : 0 + }, + "num_chunk_overlap_files" : 155, + "data_size_gb" : 207.849166870117 + }, + "files" : [] + } + }, + +**Note**: the report doesn't have any entries for individual contributions in the attribute ``files``. Only the summary info +in the attribute ``summary`` is provided. + + +.. _ingest-trans-management-start: + +Start a transaction +------------------- + +Transactions are started by this service: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``POST`` + - ``/ingest/trans`` + +The following JSON object is required to be sent in the body of a request: + +.. 
code-block::
+
+   { "database" : <string>,
+     "context" : <object>
+   }
+
+Where:
+
+``database`` : *string*
+  The required name of the database defining a scope of the new transaction.
+
+``context`` : *object* = ``{}``
+  The optional arbitrary workflow-defined object to be stored in the persistent state of
+  the Ingest System for the transaction. It's up to the workflow to decide what to store in the object.
+  For example, this information could be used later for recovering from errors during the ingest, for
+  general bookkeeping, data provenance, visualization purposes, etc. A value of this attribute, if provided,
+  must be a valid JSON object. The object could be empty.
+
+  **Note**: The current implementation of the Qserv Ingest system limits the size of the context object to **16 MB**.
+
+In case of successful completion of a request (see :ref:`ingest-general-error-reporting`) the service will return
+the JSON object with a description of the new transaction:
+
+.. code-block::
+
+   {
+     "databases" : {
+       <database> : {
+         "num_chunks" : <number>,
+         "transactions" : [
+           {
+             "begin_time" : <number>,
+             "context" : {...},
+             "database" : <string>,
+             "end_time" : <number>,
+             "id" : <number>,
+             "log" : [],
+             "start_time" : <number>,
+             "state" : "STARTED",
+             "transition_time" : <number>
+           }
+         ]
+       }
+     },
+     "success" : <number>,
+     ...
+   }
+
+The most important attribute found in the object is ``id``, which represents a unique identifier of the transaction.
+A value of the identifier needs to be memorized by a workflow to be used in subsequent
+requests to the transaction management services.
+
+The attribute ``start_time`` will be set to the current time in milliseconds since the UNIX *Epoch*,
+and the state of the new transaction will be set to ``STARTED``. The ``end_time`` will be ``0``. A value of
+the attribute ``context`` will be the same as it was provided on the input to the service, or the default
+value if none was provided.
+
+.. _ingest-trans-management-end:
+
+Commit or abort a transaction
+-----------------------------
+
+.. note::
+
+   - The current design of the service is not correct. The ``abort`` flag should be passed in the request's query
+     rather than in the body of the request. The service will be updated in the future to reflect the correct design.
+
+
+Transactions are aborted or committed by the following service:
+
+.. list-table::
+   :widths: 10 25 65
+   :header-rows: 1
+
+   * - method
+     - service
+     - query parameters
+   * - ``PUT``
+     - ``/ingest/trans/:id``
+     - ``abort=<0|1>``
+
+A unique identifier of the transaction is passed into the service in the resource's path parameter ``id``.
+The only mandatory parameter of the request query is ``abort``. A value of ``0`` tells the service
+that the transaction has to be committed normally. Any other number will be interpreted as a request to abort the transaction.
+
+Other parameters defining a request are passed via the request's body:
+
+.. code-block::
+
+   {
+     "context" : <object>
+   }
+
+Where:
+
+``context`` : *object* = ``{}``
+  The optional arbitrary workflow-defined object to be stored in the persistent state of
+  the Ingest System for the transaction. It's up to the workflow to decide what to store in the object.
+  For example, this information could be used later for recovering from errors during the ingest, for
+  general bookkeeping, data provenance, visualization purposes, etc. A value of this attribute, if provided,
+  must be a valid JSON object. The object could be empty. 
+
+  **Notes**:
+
+  - A value provided in the attribute will replace the initial value specified (if any) at the transaction
+    start time (see :ref:`ingest-trans-management-start`).
+  - The current implementation of the Qserv Ingest system limits the size of the context object to **16 MB**.
+
+Upon successful completion of either request (see :ref:`ingest-general-error-reporting`) the service would return an updated
+status of the transaction in a JSON object as it was explained in the section :ref:`ingest-trans-management-start`.
+
+State transitions of the transactions:
+
+- Aborted transactions will end up in the ``ABORTED`` state.
+- Transactions that were committed will end up in the ``FINISHED`` state.
+- In case of any problems encountered during an attempt to end a transaction, other states may be also reported
+  by the service.
+
+It's also safe to repeat either of the requests. The service will complain if the transaction is not in
+the ``STARTED`` state at the time when the request is received by the service.
+
+More information on the statuses of transactions can be found at:
+
+- :ref:`ingest-trans-management-status`
+
+.. _ingest-trans-management-descriptor:
+
+Transaction descriptor
+----------------------
+
+.. note::
+
+   This section uses a database ``gaia_edr3`` and transaction ``1630`` as an example.
+
+The content of a JSON object returned by the services varies depending on the presence of the optional parameters:
+
+- ``include_context={0|1}``
+- ``contrib={0|1}``
+- ``contrib_long={0|1}``
+- ``include_log={0|1}``
+- ``include_warnings={0|1}``
+- ``include_retries={0|1}``
+
+Subsections below describe the gradual expansion of the JSON object returned by the services as the optional parameters
+are set to ``1``.
+
+.. _ingest-trans-management-descriptor-short:
+
+Shortest form
+^^^^^^^^^^^^^
+
+The shortest form of the JSON object returned by the services when all optional parameters are set to ``0`` is:
+
+.. code-block::
+
+   {
+     "databases" : {
+       "gaia_edr3" : {
+         "is_published" : <0|1>,
+         "num_chunks" : <number>,
+         "transactions" : [
+           {
+             "id" : 1630,
+             "database" : "gaia_edr3",
+             "begin_time" : <number>,
+             "start_time" : <number>,
+             "end_time" : <number>,
+             "transition_time" : <number>,
+             "state" : <string>,
+             "context" : <object>,
+             "log" : <array>
+           },
+
+Where:
+
+``is_published`` : *number*
+  The flag tells whether the database is *published* or not.
+
+``num_chunks`` : *number*
+  The total number of chunks in the database, regardless of whether any contributions were made into the chunks
+  in a context of any transaction. Chunks need to be registered in Qserv before the corresponding MySQL tables
+  can be populated with data. This information is meant to be used for monitoring and QA purposes.
+
+``id`` : *number*
+  The unique identifier of the transaction.
+
+``database`` : *string*
+  The name of the database the transaction is associated with.
+
+``begin_time`` : *number*
+  The timestamp of the transaction creation in milliseconds since the UNIX *Epoch*. The value is
+  set by the service when the transaction is registered in the system and assigned
+  a state ``IS_STARTING``. The value is guaranteed not to be ``0``.
+
+``start_time`` : *number*
+  The timestamp of the transaction start in milliseconds since the UNIX *Epoch*. The value is
+  set by the service when the transaction is started (gets into the ``STARTED`` state).
+  The value is ``0`` while the transaction is still in the ``IS_STARTING`` state.
+
+``end_time`` : *number*
+  The timestamp of the transaction end in milliseconds since the UNIX *Epoch*. 
+  The value is set by the service when the transaction is ended (committed, aborted or failed). A value
+  of the attribute is ``0`` if the transaction is still active.
+
+``transition_time`` : *number*
+  The timestamp of the last state transition in milliseconds since the UNIX *Epoch*. The value is
+  set by the service when the transaction gets into states ``IS_FINISHING`` (the committing process
+  was initiated) or ``IS_ABORTING`` (the aborting process was initiated). The value would be set
+  to ``0`` before that.
+
+``state`` : *string*
+  The current state of the transaction. The possible values and their meanings are explained in
+  the dedicated section:
+
+  - :ref:`ingest-trans-management-states`
+
+``context`` : *object*
+  The object that was provided by a workflow at the transaction start time, or updated during transaction
+  commit/abort time. The object could be empty. The object could be used for the recovery from errors during
+  the ingest, for general bookkeeping, data provenance, visualization purposes, etc.
+
+``log`` : *array*
+  The array of log entries. Each entry is a JSON object that has the following attributes:
+
+  - ``id`` : *number* - The unique identifier of the log entry.
+  - ``transaction_state`` : *string* - The state of the transaction at the time the log entry was generated.
+  - ``name`` : *string* - The name of the event that triggered the log entry.
+  - ``time`` : *number* - The timestamp of the event in milliseconds since the UNIX *Epoch*.
+  - ``data`` : *object* - The data associated with the event.
+
+.. _ingest-trans-management-descriptor-contrib-summary:
+
+With a summary of contributions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Setting the query parameters to ``contrib=1`` (regardless of whether ``contrib_long`` is set to ``0`` or ``1``)
+will result in expanding the ``transaction`` block with the ``summary`` object. The object will
+include the summary info on all contributions made in a scope of the transaction.
+
+The following object illustrates the idea (where most of the previously explained attributes and all
+worker-level stats but the one for ``db12`` are omitted for brevity):
+
+.. code-block::
+
+   "transactions" : [
+     {
+       "contrib" : {
+         "summary" : {
+           "first_contrib_begin" : 1726026383616,
+           "last_contrib_end" : 1726026945059,
+           "num_rows" : 223420722,
+           "num_rows_loaded" : 223420722,
+           "num_regular_files" : 0,
+           "num_chunk_files" : 156,
+           "num_failed_retries" : 0,
+           "num_workers" : 9,
+           "table" : {
+             "gaia_source" : {
+               "data_size_gb" : 201.872497558594,
+               "num_rows_loaded" : 217028788,
+               "num_rows" : 217028788,
+               "num_files" : 156,
+               "num_failed_retries" : 0,
+               "num_warnings" : 0,
+               "overlap" : {
+                 "data_size_gb" : 5.97671127319336,
+                 "num_rows" : 6391934,
+                 "num_rows_loaded" : 6391934,
+                 "num_files" : 155,
+                 "num_failed_retries" : 0,
+                 "num_warnings" : 0
+               }
+             }
+           },
+           "worker" : {
+             "db12" : {
+               "data_size_gb" : 48.6947402954102,
+               "num_rows" : 52289369,
+               "num_rows_loaded" : 52289369,
+               "num_regular_files" : 0,
+               "num_chunk_files" : 18,
+               "num_chunk_overlap_files" : 23,
+               "num_failed_retries" : 0,
+               "num_warnings" : 0
+             },
+           }
+         }
+
+The ``summary`` object includes 3 sets of attributes (see the usage sketch after this list):
+
+- The general stats on the contributions made in a scope of the transaction.
+- The stats on the contributions made into the table ``gaia_source`` across all workers.
+- The stats on the contributions made into tables by the worker ``db12``. 
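+
+For illustration, a workflow could pull this summary via the single-transaction service shown earlier and check
+that every parsed row was actually loaded. This is a minimal sketch only: it assumes the ``jq`` utility is available
+and reuses the example host, port, database and transaction identifier from above:
+
+.. code-block:: bash
+
+   # fetch the summary block of transaction 1630 (example values)
+   summary=$(curl -s 'http://localhost:25881/ingest/trans/1630?contrib=1' \
+       | jq '.databases.gaia_edr3.transactions[0].contrib.summary')
+
+   # compare the number of parsed rows with the number of rows actually loaded
+   echo "$summary" | jq 'if .num_rows == .num_rows_loaded
+                         then "all parsed rows were loaded"
+                         else "some rows were rejected" end'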
+
+These are the general (transaction-level) stats:
+
+``first_contrib_begin`` : *number*
+  The timestamp of the first contribution in milliseconds since the UNIX *Epoch*. This is the time when the processing of the contribution started.
+
+``last_contrib_end`` : *number*
+  The timestamp of the last contribution in milliseconds since the UNIX *Epoch*. This is the time when the processing of the contribution ended.
+
+``num_rows`` : *number*
+  The total number of rows parsed in all input contributions made in a scope of the transaction.
+
+``num_rows_loaded`` : *number*
+  The total number of rows that were actually loaded into the destination table(s) in all contributions made in a scope of the transaction.
+
+  **Note**: Normally the number of rows loaded should be equal to the number of rows parsed. If the numbers differ it means that some
+  rows were rejected during the ingest process. The workflow should always monitor any mismatches in these values and trigger alerts.
+
+``num_regular_files`` : *number*
+  The total number of regular files (not chunk files) parsed in all input contributions.
+
+``num_chunk_files`` : *number*
+  The total number of chunk files parsed in all input contributions.
+
+``num_failed_retries`` : *number*
+  The total number of retries that failed during the ingest process.
+
+  **Note**: In most cases it's okay that the number of failed retries is not zero. The system is designed to retry
+  the ingest of the failed contributions. A problem is when the number of such failures detected in the scope of
+  a single contribution exceeds a limit set at the Ingest system. The workflow should always monitor
+  the number of failed retries and trigger alerts if the number is too high.
+
+``num_workers`` : *number*
+  The total number of workers that were involved in the ingest process.
+
+
+.. _ingest-trans-management-descriptor-contrib-long:
+
+With detailed info on contributions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Setting the query parameters to ``contrib=1`` and ``contrib_long=1`` will result in expanding the ``contrib`` object
+with the ``files`` array. Each entry (JSON object) in the array represents a contribution. The objects provide
+the detailed info on all contributions made in a scope of the transaction:
+
+.. code-block::
+
+   "transactions" : [
+     {
+       "contrib" : {
+         "files" : [
+           <contribution>,
+           ...
+           <contribution>
+         ]
+       }
+     }
+   ]
+
+The schema of the contribution objects is covered by:
+
+- :ref:`ingest-worker-contrib-descriptor`
+
+**Note**: Extended info on warnings and retries posted during contribution loading is still disabled in this case.
+To enable warnings use the parameter ``include_warnings=1``. To enable retries use the parameter ``include_retries=1``.
+
+.. _ingest-trans-management-states:
+
+Transaction states
+------------------
+
+Transactions have well-defined states and a state transition algorithm. Normally, the Ingest System moves a transaction
+from one state to another in response to the explicit transaction management requests made by a workflow. In some cases
+the Replication/Ingest system may also change the states.
+
+.. image:: /_static/ingest-transaction-fsm.png
+   :target: ../../../../../_images/ingest-transaction-fsm.png
+   :alt: State Transition Diagram
+
+A few comments on the diagram:
+
+- States ``STARTED``, ``FINISHED`` and ``ABORTED`` which are shown in grey boxes are the *intended* stable states of
+  a transaction. These states are *expected* to be reached by the transaction management requests. 
+- States ``START_FAILED``, ``FINISH_FAILED`` and ``ABORT_FAILED`` which are shown in red are the *unintended* intermediate
+  stable states of a transaction. The transaction gets into these states when the system encounters problems during
+  processing of the corresponding transaction management requests. Transitions into these states are shown as red dashed lines.
+  The only way to get out of these states is to fix the underlying problem (which could be a problem with the infrastructure, the data,
+  or bugs in the Ingest system or Qserv) and issue another transaction management request to *abort* the transaction.
+
+  .. hint::
+
+     - In many cases the reason for the failure is reported in the response object returned by the corresponding transaction
+       management request.
+
+- States ``IS_STARTING``, ``IS_FINISHING`` and ``IS_ABORTING`` are the *transient* unstable states which are meant to be
+  passed through by a transaction on its way to the desired *intended* stable state. The states are used by the system
+  to indicate a significant (and often lengthy) transformation of the data or metadata triggered by the state transition
+  of the transaction.
+
+  - In some cases the transaction may stay in one of these states for a while. For example, this happens when the *commit* request
+    was initiated for the transaction and the database options specified by a workflow require the system to build
+    the *director* index of the *director* table at the *commit* time of the transactions. The system will keep
+    the transaction in the state ``IS_FINISHING`` until the index is built. The state will be changed to ``FINISHED``
+    once the index is built successfully.
+
+  - It's possible that a transaction may get stuck in one of these *transient* states. The only scenario when this may
+    happen in the current implementation would be when the Master Replication Controller gets restarted while the transaction
+    is in one of these states. The system will not be able to resume the transaction processing after the restart.
+    This limitation will be addressed in the future.
+
+
+The following table explains possible state transitions of a transaction:
+
+.. list-table::
+   :widths: 10 80 10
+   :header-rows: 1
+
+   * - state
+     - description
+     - next states
+   * - ``IS_STARTING``
+     - The initial (transient) state assigned to a transaction right after it's registered in the system
+       in response to a request to start a transaction: :ref:`ingest-trans-management-start`.
+       This transient state should change to ``STARTED`` or ``START_FAILED``.
+       The former state is assigned to a transaction that was successfully started, the latter
+       to a transaction that failed to start.
+
+     - | ``STARTED``
+       | ``START_FAILED``
+
+   * - ``STARTED``
+     - The active state of a transaction that is ready to accept data ingest requests.
+       When the system receives a request to commit or abort the transaction (see :ref:`ingest-trans-management-end`)
+       the state would transition to the corresponding transient states ``IS_FINISHING`` or ``IS_ABORTING``.
+     - | ``IS_FINISHING``
+       | ``IS_ABORTING``
+
+   * - ``IS_FINISHING``
+     - The transient state assigned to a transaction that is in the process of being committed.
+       Depending on the database options specified by a workflow, the transaction may stay in this state
+       for a while.
+       The state will change to ``FINISHED`` in case of the successful completion of a request, or it may
+       land in the ``FINISH_FAILED`` state in case of any problems encountered during the request
+       execution. A transaction may also get into the ``IS_ABORTING`` state if a workflow issues the abort
+       request while the transaction is being finished.
+
+     - | ``FINISHED``
+       | ``FINISH_FAILED``
+       | ``IS_ABORTING``
+
+   * - ``IS_ABORTING``
+     - The transitional state triggered by the transaction abort request (see :ref:`ingest-trans-management-end`).
+     - | ``ABORTED``
+       | ``ABORT_FAILED``
+
+   * - ``FINISHED``
+     - The final state of a transaction that was successfully committed.
+     -
+
+   * - ``ABORTED``
+     - The final state of a transaction that was successfully aborted.
+     -
+
+   * - ``START_FAILED``
+     - The (inactive) state of a transaction that failed to start. The state allows
+       a workflow to initiate the transaction abort request.
+     - ``IS_ABORTING``
+
+   * - ``FINISH_FAILED``
+     - The (inactive) state of a transaction that failed to be committed. The state allows
+       a workflow to initiate the transaction abort request.
+     - ``IS_ABORTING``
+
+   * - ``ABORT_FAILED``
+     - The (inactive) state of a transaction that failed to be aborted. The state allows
+       a workflow to initiate another transaction abort request (or requests).
+     - ``IS_ABORTING``
+
diff --git a/doc/ingest/api/reference/rest/general.rst b/doc/ingest/api/reference/rest/general.rst new file mode 100644 index 0000000000..85603dd920 --- /dev/null +++ b/doc/ingest/api/reference/rest/general.rst @@ -0,0 +1,314 @@
+General guidelines
+==================
+
+.. _ingest-general-request-headers:
+
+Request headers
+---------------
+
+All (but those where it's explicitly stated otherwise) services accepting requests sent with ``POST``, ``PUT`` or ``DELETE``
+methods require the following HTTP header to be sent along with the JSON request object:
+
+.. code-block::
+
+   Content-Type: application/json
+
+When requests are sent using the command line application ``curl``, then the following option must be used:
+
+.. code-block:: bash
+
+   curl <url> -X <method> -H "Content-Type: application/json"
+
+In this case a JSON object can be specified using one of the following methods:
+
+.. code-block:: bash
+
+   echo '{...}' | curl <url> -X <method> -H <header> -d@-
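+   # the JSON document above is read from the standard input (-d@-);
+   # alternatively, the same document can be passed inline: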
+   curl <url> -X <method> -H <header> -d '{...}'
+
+Where ``{...}`` represents a JSON object with details of the request. The object may not be required for some requests.
+Specific requirements for this will be mentioned in each service. If the object is not required for a particular
+request then the body is allowed to be empty, or it could be an empty JSON object ``{}``.
+
+All (no exception) services return results and errors as JSON objects as explained in the next subsection below.
+
+.. _ingest-general-error-reporting:
+
+Error reporting when calling the services
+-----------------------------------------
+
+.. note::
+
+   The error reporting mechanism implemented in the System serves as a foundation for building reliable workflows.
+
+All services explained in the document adhere to the usual conventions adopted by the Web community for designing and using the REST APIs. In particular, HTTP code 200 is returned if a request is well-formed and accepted by the corresponding service. Any other code shall be treated as an error. However, the implementation of the System further extends the error reporting mechanism by guaranteeing that all services provide fine-grained error reporting in the response objects. All services of the API are guaranteed to return a JSON object if the HTTP code is 200. The objects would have the following mandatory attributes (other attributes depend on a request):
+
+.. code-block::
+
+   { "success" : <number>,
+     "error" : <string>,
+     "error_ext" : <object>,
+     ...
+   }
+
+**Note**: depending on the service, additional attributes may be present in the response object.
+
+Therefore, even if a request is completed with HTTP code ``200``, a client (a workflow) must inspect the above-mentioned
+fields in the returned object. These are the rules for inspecting the status attributes:
+
+- Successful completion of a request is indicated by having ``success=1`` in the response. In these cases, the other
+  two fields should be ignored.
+- Otherwise, a human-readable explanation of a problem would be found in the ``error`` field.
+- Request-specific extended information on errors is optionally provided in the ``error_ext`` field.
+
+Optional warnings
+^^^^^^^^^^^^^^^^^
+
+**Note**: Warnings were introduced as of version ``12`` of the API.
+
+REST services may also return the optional attribute ``warning`` to warn a caller about potential problems with a request.
+The very presence of such a warning doesn't necessarily mean that the request failed. Users are still required
+to use the above-described error reporting mechanism for inspecting the completion status of requests.
+Warnings carry additional information that may be present in any response regardless of whether it succeeded or not.
+It's up to a user to interpret this information based on a specific request and the context in which it was made.
+
+Here is what to expect within the response object if a warning was reported:
+
+.. code-block::
+
+   { "success" : <number>,
+     "warning" : <string>,
+     ...
+   }
+
+.. _ingest-general-auth:
+
+Authorization and authentication
+--------------------------------
+
+All services accepting requests sent with ``POST``, ``PUT`` or ``DELETE`` methods require the following attribute
+to be present in the request object:
+
+``auth_key`` : *string*
+  The authentication key that is required for an operation. The key is used to prevent unauthorized access to the service.
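+
+  For illustration, a hypothetical request starting a transaction might supply the key in its JSON body as follows
+  (a sketch only; the host, port, database name, and key value are placeholders rather than actual values):
+
+  .. code-block:: bash
+
+     curl http://localhost:25081/ingest/trans -X POST \
+          -H 'Content-Type: application/json' \
+          -d '{"auth_key":"CHANGE-ME","database":"gaia_edr3","context":{}}'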
+ +Certain requests (where it's specificly stated by the description of the service) may require the elevated privileges +to be specified in the following attribute: + +``admin_auth_key`` : *string* + The Administrator-level authentication key that is required for an operation. The key is used to prevent unauthorized + access to the service that will modify existing data visible to Qserv users. + +.. _ingest-general-versioning: + +Protocol Versioning +------------------- + +The API adheres to the optional version control mechanism introduced in: + +- https://rubinobs.atlassian.net/browse/DM-35456 + +Workflow developers are encouraged to use the mechanism to reinforce the integrity of the applications. + +There are two ways the workflows can use the version numbers: + +- *pull mode*: Ask the Replication Controller explicitly what version it implements and cross-check the returned + version versus a number expected by the application. +- *push mode*: Pass the expected version number as a parameter when calling services and let + the services verify if that version matches one of the frontend implementations. + +Workflow developers are free to use neither, either of two, or both methods of reinforcing their applications. + +Pull mode +^^^^^^^^^ + +To support the first scenario, the API provides a special metadata service that will return +the version number (along with some other information on the frontend): + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``GET`` + - ``/meta/version`` + +The request object for this request is not required, or it could be an empty JSON object ``{}``. +In case of its successful completion, the service will return a JSON object that will include +the following attributes (along with the other standard attributed that are used for error reporting): + +.. code-block:: + + { "kind" : , + "name" : , + "id" : , + "instance_id" : , + "version" : , + "database_schema_version" : , + "success" : , + "warning" : , + "error" : , + "error_ext" : + } + +Where, the service-specific attributes are: + +``kind`` : *string* + The name of the service. The following name is always reported: + + .. code-block:: + + replication-controller + +``name`` : *string* + The unique name of the frontend within a given Qserv. The current implementation will always return: + + .. code-block:: + + http + +``id`` : *number* + A unique identifier of the Replication Controller. The number returned here may vary. + +``instance_id`` : *string* + An identifier of the Qserv instance. A value of the attribute depends on a particular deployment of Qserv. + +``version`` : *number* + The current version number of the API. + +``database_schema_version`` : *number* + The schema version number of the Replication System's Database. + +Example: + +.. code-block:: json + + { "kind" : "replication-controller", + "id" : "9037c818-4820-4b5e-9219-edbf971823b2", + "instance_id" : "qserv_proj", + "version" : 27, + "database_schema_version" : 14, + "success" : 1, + "error" : "", + "error_ext" : {}, + "warning" : "" + } + +Push mode +^^^^^^^^^ + +In the case of the second scenario, an application will pass the desired version number as +a request parameter. The number would be a part of the request's query for the method. For example, +the following request for checking the status of the ongoing query might look like this: + +.. 
code-block:: bash + + curl 'http://localhost:25004/trans/contrib/1234?version=35' -X GET + +For other HTTP methods used by the API, the number is required to be provided within the body +of a request as shown below: + +.. code-block:: bash + + curl 'http://localhost:25004/trans/contrib' -X POST \ + -H 'Content-Type: application/json' \ + -d'{"version":35, ..."}' + +If the number does not match expectations, such a request will fail and the service return the following +response. Here is an example of what will happen if the wrong version number ``29`` is specified instead +of ``35`` (as per the current version of the API): + +.. code-block:: json + + { "success" : 0, + "error" : "The requested version 29 of the API is not in the range supported by the service.", + "error_ext": { + "max_version" : 35, + "min_version" : 32 + }, + "warning" : "" + } + +.. _ingest-general-binary-encoding: + +Binary encoding of the data in JSON +----------------------------------- + +The API supports encoding of the binary data into JSON. The encoding specification is provided as a parameter +``binary_encoding`` when calling several services. The parameter may be optional and if not provided, the default +value is ``hex``. The parameter is used by the services and by the client applications in two different ways: + +- When a client is sending data to a service, the client is required to tell the service how the binary data are encoded. + The service would invoke the corresponding decoding algorithm to decode the data into the original representation. + +- A service designed for sending data to a client is expected to get the name of the desired encoding + algorithm in a request to the service. The service would then encode the binary data into the JSON object + using the specified algorithm. + +The following options for the values of the parameter are allowed in the current version of the API: + +- ``hex`` - for serializing each byte into the hexadecimal format of 2 ASCII characters per each byte + of the binary data, where the encoded characters will be in a range of ``0 .. F``. In this case, + the encoded value will be packaged into the JSON string. +- ``b64`` - for serializing bytes into a string using the ``Base64`` algorithm with padding (to ensure 4-byte alignment). +- ``array`` - for serializing bytes into the JSON array of numbers in a range of ``0 .. 255``. + +Here is an example of the same sequence of 4-bytes encoded into the hexadecimal format: + +.. code-block:: + + 0A11FFD2 + +The array representation of the same binary sequence would look like this: + +.. code-block:: json + + [10,17,255,210] + +MySQL types (regardless of the case) that include the following keywords are treated as binary: + +.. code-block:: sql + + BIT + BINARY + BLOB + +For example, these are the binary types: + +.. code-block:: sql + + BIT(1) + BINARY(8) + VARBINARY(16) + TINYBLOB + BLOB + MEDIUMBLOB + LONGBLOB + + +.. _ingest-general-base-table-names: + +Base versus final table names +----------------------------- + +In descriptions of several services, the documentation uses an adjective *base* when referring to tables affected +by requests to the services. In reality, those *base* tables are exactly the names of the Qserv tables as they are seen +by Qserv users. In the distributed realm of Qserv each such table is collectively represented by many *final* tables +distributed across Qserv worker nodes. 
The names of the *final* tables depend on the table type:
+
+- *regular* (fully replicated) tables have the same name as the *base* table
+- *partitioned* (chunked) tables have names constructed using the *base* name and the chunk numbers and values
+  of the overlap attribute of the table.
+
+Formally, the names of the *final* tables are constructed as follows:
+
+.. code-block::
+
+   <final-table> = <base-table> | <base-table>_<chunk> | <base-table>FullOverlap_<chunk>
+
+For example:
+
+.. code-block::
+
+   Filter
+   Object_1234
+   ObjectFullOverlap_1234
diff --git a/doc/ingest/api/reference/rest/index.rst b/doc/ingest/api/reference/rest/index.rst new file mode 100644 index 0000000000..84746cc61a --- /dev/null +++ b/doc/ingest/api/reference/rest/index.rst @@ -0,0 +1,10 @@
+#############
+REST Services
+#############
+
+.. toctree::
+   :maxdepth: 3
+
+   general
+   controller/index
+   worker/index
diff --git a/doc/ingest/api/reference/rest/worker/index.rst b/doc/ingest/api/reference/rest/worker/index.rst new file mode 100644 index 0000000000..32884f83ab --- /dev/null +++ b/doc/ingest/api/reference/rest/worker/index.rst @@ -0,0 +1,915 @@
+####################
+Worker Ingest Server
+####################
+
+.. note::
+
+   Services explained in this section are provided directly by Qserv workers, not by the main REST server of
+   the Master Replication Controller. Each Qserv worker runs a dedicated Ingest Server that is responsible for
+   ingesting and managing catalogs located on the corresponding worker. Ingest workflows interact directly with
+   workers using this API. The DNS names (IP addresses) of the corresponding hosts and the relevant port numbers
+   of the worker services are returned by requests sent to the Master Replication Controller's services:
+
+   - :ref:`table-location-chunks`
+   - :ref:`table-location-chunks-one`
+   - :ref:`table-location-regular`
+
+.. _ingest-worker-contrib-by-ref:
+
+Ingesting contributions by reference
+====================================
+
+Contribution ingest requests can be initiated using one of these techniques:
+
+- *synchronous processing*: a client will get blocked for the duration of the request before it finishes (or fails)
+  to be executed. After that, the client would have to analyze the final state of the request from a response sent
+  by the service.
+- *asynchronous processing*: a client will not be blocked. Once the request's parameters were successfully parsed and
+  analyzed (and accepted), the request will be queued for asynchronous processing. After that, the service will send back
+  a response with the unique identifier and the current status of the request. The workflow will have to use the identifier
+  to track the progression of the request. For requests that failed the validation stage, information on reasons for
+  the failure will be returned.
+
+The following REST services implement these protocols:
+
+.. list-table::
+   :widths: 10 90
+   :header-rows: 0
+
+   * - ``POST``
+     - | ``/ingest/file``
+       | ``/ingest/file-async``
+
+The services expect a caller to provide a description of a request in the request's body in the form of a JSON object.
+The object should adhere to the following schema:
+
+.. code-block::
.. code-block::

    {
      "transaction_id" : <number>,
      "table" : <string>,
      "chunk" : <number>,
      "overlap" : <number>,

      "url" : <string>,

      "fields_terminated_by" : <string>,
      "fields_enclosed_by" : <string>,
      "fields_escaped_by" : <string>,
      "lines_terminated_by" : <string>,
      "charset_name" : <string>,

      "http_method" : <string>,
      "http_data" : <string>,
      "http_headers" : <string>,

      "max_num_warnings" : <number>,
      "num_retries" : <number>
    }


Where:

``transaction_id`` : *number*
    The required unique identifier of a transaction that has to be in the ``STARTED`` state
    at the time when a request is received by the service. More information on the transaction management and transaction
    states can be found in: :ref:`ingest-trans-management`.

``table`` : *string*
    The required *base* name of a table receiving the contribution. See :ref:`ingest-general-base-table-names` for more details
    on the meaning of the attribute *base* in the context of this API.

``chunk`` : *number*
    The required chunk number for the partitioned tables.

    **Note**: ignored for the *regular* tables.

``overlap`` : *number*
    The required numeric flag indicating the kind of partitioned table (``0`` if this is not the *overlap*
    table, or any other number if this is the *overlap* table).

    **Note**: ignored for the *regular* tables.

``url`` : *string*
    The required location of a file to be ingested. The current implementation supports the following schemes:

    - ``file:///``: A file on a filesystem that is mounted locally on the corresponding worker. Note that
      the file path must be absolute. See details on this subject at: https://en.wikipedia.org/wiki/File_URI_scheme.

    - ``http://``, ``https://``: A file on a web server. For either of these schemes, additional
      attributes (if needed) for pulling a file over the specified protocol could be provided in the optional parameters:
      ``http_method``, ``http_data`` and ``http_headers``. Descriptions of the parameters are provided below.

    **Note**: Workflows may also configure the behavior of the ``libcurl`` library by setting the library-specific
    options at the level of a database. See instructions at: :ref:`ingest-config`.

``fields_terminated_by`` : *string* = ``\t``
    The optional parameter of the desired CSV dialect: a character that separates fields in a row.
    The default value assumes the tab character.

``fields_enclosed_by`` : *string* = ``""``
    The optional parameter of the desired CSV dialect: a character that encloses fields in a row.
    The default value assumes no quotes around fields.

``fields_escaped_by`` : *string* = ``\\``
    The optional parameter of the desired CSV dialect: a character that escapes special characters in a field.
    The default value assumes two backslash characters.

``lines_terminated_by`` : *string* = ``\n``
    The optional parameter of the desired CSV dialect: a character that separates rows.
    The default value assumes the newline character.

``charset_name`` : *string* = ``latin1``
    The optional parameter specifies the desired character set name to be assumed when ingesting
    the contribution. The default value may also be affected by the ingest services configuration.
    See the following document for more details:

    - **TODO**: A reference to the page "Specifying character sets when ingesting tables into Qserv"

``http_method`` : *string* = ``GET``
    The optional method that is used to pull a file over the HTTP protocol.

``http_data`` : *string* = ``""``
    The optional data that is sent in the body of the HTTP request.
    The default value assumes no data are sent.

``http_headers`` : *string* = ``""``
    The optional list of headers that are sent in the HTTP request.
    The default value assumes no headers are sent. A value of the parameter is a string that contains
    zero, one, or many header definitions, where each such definition should look like:

    .. code-block::

        <header-name>: <value>

``max_num_warnings`` : *number* = ``64``
    The optional limit for the number of notes, warnings, and errors to be retained by MySQL/MariaDB when
    loading the contribution into the destination table.

    **Note**: The default value of the limit is determined by the configuration of the ingest services.
    The default value of the parameter in MySQL/MariaDB is ``64``. The upper limit for the parameter is ``65535``.
    Some workflows may choose to set a specific value for the limit when debugging data of the contributions.

    **TODO**: "Configuration Guide for the Replication/Ingest System" (a reference to the page)

``num_retries`` : *number* : **optional**
    The optional number of automated retries of failed contribution attempts in cases when
    such retries are still possible. The number can be further limited by the ingest service to a value that will
    not exceed the "hard" limit set in the worker configuration parameter (``worker``, ``ingest-max-retries``).
    Setting a value of the parameter to ``0`` will explicitly disable automatic retries regardless of the server's
    configuration settings.

    **Notes**:

    - The parameter is ignored by the *synchronous* service.
    - If the parameter is not provided, the default number of retries set in the Ingest Server's configuration parameter
      (``worker``, ``ingest-num-retries``) will be assumed.

    **TODO**: "Configuration Guide for the Replication/Ingest System" (a reference to the page)

The service will return the following JSON object:

.. code-block::

    { "contrib": {
        ...
      }
    }

See the :ref:`ingest-worker-contrib-descriptor` section of the document for the details on the schema of the response object.

.. _ingest-worker-contrib-by-val:

Ingesting contributions by value
================================

Contributions can also be ingested by sending data directly to the worker server in the request body. There are two services
in this category. Both techniques are *synchronous* and the client will be blocked until the request is processed:

- sending data as a JSON object
- sending data as a ``CSV`` file in the ``multipart/form-data`` formatted body

Each technique has its own pros and cons.

.. _ingest-worker-contrib-by-val-json:

JSON object
-----------

The following service allows a workflow to push both data and a description of the contribution request as a JSON object:

.. list-table::
   :widths: 10 90
   :header-rows: 0

   * - ``POST``
     - ``/ingest/data``

The service expects a caller to provide a description of a request in the request's body in the form of a JSON object:

.. code-block::

    {
      "transaction_id" : <number>,
      "table" : <string>,
      "chunk" : <number>,
      "overlap" : <number>,
      "charset_name" : <string>,
      "binary_encoding" : <string>,
      "max_num_warnings" : <number>,
      "rows" : [
        <row>,
        ...
        <row>
      ]
    }

Where:

``transaction_id`` : *number*
    The required unique identifier of a transaction that has to be in the ``STARTED`` state
    at the time when a request is received by the service. More information on the transaction management and transaction
    states can be found in: :ref:`ingest-trans-management`.

``table`` : *string*
    The required *base* name of a table receiving the contribution.
    See :ref:`ingest-general-base-table-names` for more details
    on the meaning of the attribute *base* in the context of this API.

``chunk`` : *number*
    The required chunk number for the partitioned tables.

    **Note**: ignored for the *regular* tables.

``overlap`` : *number*
    The required numeric flag indicating the kind of partitioned table (``0`` if this is not the *overlap*
    table, or any other number if this is the *overlap* table).

    **Note**: ignored for the *regular* tables.

``charset_name`` : *string* = ``latin1``
    The optional character set name to be assumed when ingesting the contribution. The default value depends
    on the configuration of the Qserv ingest services.

``binary_encoding`` : *string* = ``hex``
    See :ref:`ingest-general-binary-encoding` for more details.

``max_num_warnings`` : *number* = ``64``
    The optional limit for the number of notes, warnings, and errors to be retained by MySQL/MariaDB when
    loading the contribution into the destination table.

    **Note**: The default value of the limit is determined by the configuration of the ingest services.
    The default value of the parameter in MySQL/MariaDB is ``64``. The upper limit for the parameter is ``65535``.
    Some workflows may choose to set a specific value for the limit when debugging data of the contributions.

    **TODO**: "Configuration Guide for the Replication/Ingest System" (a reference to the page)

``rows`` : *array*
    The required collection of the data rows to be ingested. Each element of the array represents a complete row,
    where elements of the row represent values of the corresponding columns in the table schema:

    .. code-block::

        [[<value>, ... ],
         ...
         [<value>, ... ]
        ]

    **Note**:

    - The number of elements in each row must be the same as the number of columns in the table schema.
    - Positions of the elements within rows should match the positions of the corresponding columns in the table schema.
    - See the :ref:`ingest-db-table-management-register-table` section for the details on the table schema.

The service will return the following JSON object:

.. code-block::

    { "contrib": {
        ...
      }
    }

See the :ref:`ingest-worker-contrib-descriptor` section of the document for the details on the schema of the response object.

.. _ingest-worker-contrib-by-val-csv:

CSV file
--------

.. warning::

    The service expects a certain order of the parts in the body of the request. The description of the contribution
    request should be posted first, and the file payload should be posted second. There must be exactly one file payload
    in the body of the request. A missing file or multiple files will be treated as an error and reported as such
    in the response.

The following service allows a workflow to push both data (a ``CSV`` file) and a description of the contribution request in
the ``multipart/form-data`` formatted body:

.. list-table::
   :widths: 10 90
   :header-rows: 0

   * - ``POST``
     - ``/ingest/csv``

The body should contain two parts posted in the following order:

- a collection of the key-value pairs that represent the description of the contribution request
- a single file payload that contains the data to be ingested

The keys that describe the contribution request are presented below:

``transaction_id`` : *number*
    The required unique identifier of a transaction that has to be in the ``STARTED`` state
    at the time when a request is received by the service. More information on the transaction management and transaction
    states can be found in: :ref:`ingest-trans-management`.
+ +``table`` : *string* + The required *base* name of a table receiving the contribution. See :ref:`ingest-general-base-table-names` for more details + on the meaning of the attriute *base* in a context of this API. + +``chunk`` : *number* + The required chunk number for the partitioned tables. + + **Note**: ignored for the *regular* tables. + +``overlap`` : *number* + The required numeric flag indicates a kind of partitioned table (``0`` if this is not the *overlap* + table or any other number of this is the *overlap* table). + + **Note**: ignored for the *regular* tables. + +``charset_name`` : *string* = ``latin1`` + The optional parameter that depends on Qserv settings. + +``fields_terminated_by`` : *string* = ``\t`` + The optional parameter of the desired CSV dialect: a character that separates fields in a row. + +``fields_enclosed_by`` : *string* = ``""`` + The optional parameter of the desired CSV dialect: a character that encloses fields in a row. + The default value assumes no quotes around fields. + +``fields_escaped_by`` : *string* = ``\\`` + The optional parameter of the desired CSV dialect: a character that escapes special characters in a field. + The default value assumes two backslash characters. + +``lines_terminated_by`` : *string* = ``\n`` + The default value of the optional parameter assumes the newline character. + +``max_num_warnings`` : *number* = ``64`` + The optional limit for the number of notes, warnings, and errors to be retained by MySQL/MariaDB when + loading the contribution into the destination table. + + **Note**: The default number of the limit is determined by a configuration of the ingest services. + The default value of the parameter in MySQL/MariaDB is ``64``. The upper limit for the parameter is ``65535``. + Some workflows may choose to set a specific value for the limit when debugging data of the contributions. + + **TODO**: "Configuration Guide for the Replication/Ingest System" (a reference to the page) + +The service will return the following JSON object: + +.. code-block:: + + { "contrib": { + ... + } + } + +See the :ref:`ingest-worker-contrib-descriptor` section of document for the details on the schema of the response object. + +Here is an example of how the request could be formatted using ``curl``: + +.. code-block:: bash + + curl http://localhost:25004/ingest/csv \ + -X POST -H 'Content-Type: multipart/form-data' \ + -F 'transaction_id=1630'\ + -F 'table=gaia_source' \ + -F 'chunk=675' \ + -F 'overlap=0' \ + -F 'charset_name=latin1' \ + -F 'fields_terminated_by=,' \ + -F 'max_num_warnings=64' \ + -F 'file=@/path/to/file.csv' + +**Note**: the request header ``-H 'Content-Type: multipart/form-data'`` is not required when using ``curl``. The header +is added here for the sake of clarity. + +Another example is based on Python's ``requests`` library and the ``requests_toolbelt`` package: + +.. 
code-block:: python

    import requests
    from requests_toolbelt.multipart.encoder import MultipartEncoder
    import urllib3

    # Suppress the warning about the self-signed certificate
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    url = "http://localhost:25004/ingest/csv"
    encoder = MultipartEncoder(
        fields = {
            "transaction_id": (None, "1630"),
            "table": (None, "gaia_source"),
            "chunk": (None, "675"),
            "overlap": (None, "0"),
            "charset_name": (None, "latin1"),
            "fields_terminated_by": (None, ","),
            "max_num_warnings": (None, "64"),
            "file": ("file.csv", open("/path/to/file.csv", "rb"), "text/csv")
        }
    )
    req = requests.post(url, data=encoder,
                        headers={"Content-Type": encoder.content_type},
                        verify=False)
    req.raise_for_status()
    res = req.json()
    if res["success"] == 0:
        error = res["error"]
        raise RuntimeError(f"Failed to ingest data into the table: {error}")

**Notes**:

- The ``MultipartEncoder`` class from the ``requests_toolbelt`` package is used for both formatting
  the request and sending it in the *streaming* mode. The mode is essential for avoiding memory problems
  on the client side when pushing large contributions into the service. Without the streaming mode the client
  would try to load the whole file into memory before sending it to the server.
- The parameter ``verify=False`` is used to ignore SSL certificate verification. Also note using ``urllib3``
  to suppress the certificate-related warning. Do not use this in production code.

.. _ingest-worker-contrib-get:

Status of requests
==================

There are two services in this group. The first one allows retrieving the status info of a single request by
its identifier. The second service is meant for querying statuses of all asynchronous requests of the given transaction.

.. _ingest-worker-contrib-get-one:

One request
-----------

The service allows obtaining the status of an *asynchronous* contribution request:

.. list-table::
   :widths: 10 90
   :header-rows: 0

   * - ``GET``
     - ``/ingest/file-async/:id``

The service expects a caller to provide a unique identifier ``id`` of the contribution request in the resource path.
Values of the identifiers are returned by services that accept the contribution requests.

If the identifier is valid and the service could locate the desired record for the contribution it will return the following
JSON object:

.. code-block::

    { "contrib": {
        ...
      }
    }

See the :ref:`ingest-worker-contrib-descriptor` section of the document for the details on the schema of the response object.

.. _ingest-worker-contrib-get-trans:

All requests of a transaction
-----------------------------

The service allows obtaining the status of all *asynchronous* contribution requests submitted in the scope of
a given transaction:

.. list-table::
   :widths: 10 90
   :header-rows: 0

   * - ``GET``
     - ``/ingest/file-async/trans/:id``

The service expects a caller to provide a unique identifier ``id`` of the transaction in the resource path.
Values of the transaction identifiers are returned by services that manage transactions. See :ref:`ingest-trans-management`
for more details.

If the identifier is valid and the service could locate the relevant contributions it will return the following
JSON array:

.. code-block::

    { "contribs": [
        <contrib>,
        ...
        <contrib>
      ]
    }

See the :ref:`ingest-worker-contrib-descriptor` section of the document for the details on the schema of the contribution objects.
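
The following is a minimal sketch (not part of the API itself) of how a workflow might use the service above to poll
a single *asynchronous* request until it reaches a terminal state. It is based on Python's ``requests`` library; the
worker host, port, and request identifier are placeholders, and the set of terminal states corresponds to the ``status``
values documented in :ref:`ingest-worker-contrib-descriptor`. The standard ``success``/``error`` attributes of the
response envelope are assumed to be present, as in the other examples of this API. Depending on the deployment,
authorization and API version parameters may also be required; they are omitted here.

.. code-block:: python

    import time
    import requests

    # Placeholders: the worker's Ingest Server endpoint and the identifier of
    # a previously submitted asynchronous contribution request.
    worker_url = "http://qserv-worker-0.qserv-worker.default.svc.cluster.local:25004"
    request_id = 2651966

    # Terminal states of a contribution request (see the contribution descriptor).
    terminal_states = {"FINISHED", "CREATE_FAILED", "START_FAILED",
                       "READ_FAILED", "LOAD_FAILED", "CANCELLED"}

    while True:
        resp = requests.get(f"{worker_url}/ingest/file-async/{request_id}")
        resp.raise_for_status()
        res = resp.json()
        if res.get("success", 1) == 0:
            raise RuntimeError(f"Status request failed: {res.get('error', '')}")
        status = res["contrib"]["status"]
        if status in terminal_states:
            break
        # The request is still queued or being processed (IN_PROGRESS).
        time.sleep(5)

    print(f"Request {request_id} finished with status {status}")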
.. _ingest-worker-contrib-retry:

Retrying failed contributions
=============================

.. note::

    - Services presented in this section complement those meant for the initial submission of contribution
      requests posted *by reference*, regardless of the interface used (*synchronous* or *asynchronous*), as documented
      in :ref:`ingest-worker-contrib-by-ref`. The eligibility requirements for the requests are further explained in:

      - **TODO**: "Automatic retries for the failed contribution requests" (a reference to the page)

    - Unlike the *automatic* retries that may be configured in the original contribution request,
      the *explicit* retrying is a responsibility of the ingest workflow.
    - The number of the explicit retries is not subject to the limits set for the automatic retries.
      It's up to the workflow to decide how many such retries should be attempted. The workflow should coordinate
      the retries with the transaction management to avoid the situation when the same request is retried
      while the transaction is already in a state that doesn't allow the contribution to be processed.
    - The workflow should avoid making multiple parallel requests to retry the same contribution request.
      The workflow should always wait for the response of the previous retry before making another one.
    - The *automatic* retries are disabled by the Ingest service while processing the explicitly made retries.

Both *synchronous* and *asynchronous* services are provided for retrying the failed contributions:

.. list-table::
   :widths: 10 90
   :header-rows: 0

   * - ``PUT``
     - | ``/ingest/file/:id``
       | ``/ingest/file-async/:id``

The services expect a caller to provide a unique identifier ``id`` of the contribution request to be retried.

The services will locate and evaluate the specified request to see if it's eligible for retrying. If it is, the request
will be processed in accordance with the logic of the called service. Specifically:

- If the *synchronous* interface was invoked then the request will be attempted right away and only once (no further
  automatic retries).
- If the alternative *asynchronous* interface was invoked then the request will be placed at the very end of the input
  queue. It will be processed in its turn when picked by one of the processing threads of the ingest server's pool.
  As in the case of *synchronous* processing, only one attempt to process the request will be made.

The service will return the following JSON object:

.. code-block::

    { "contrib": {
        ...
      }
    }

See the :ref:`ingest-worker-contrib-descriptor` section of the document for the details on the schema of the response object.

.. _ingest-worker-contrib-cancel:

Cancelling async requests
=========================

.. warning::

    In general, request cancellation is a non-deterministic operation that is prone to *race conditions*.
    An outcome of the cancellation request depends on the current state of a request within the worker service:

    - If the request is still in the wait queue then the cancellation will be successful.
    - If the request is already being processed by the ingest machinery then the cancellation will be successful
      only if the request is still in the data *reading* state.
    - Requests that are already in the *loading* state are presently not cancellable since MySQL table loading
      is a non-interruptible operation.
    - If the request is already in the *finished* or any form of the *failed* state then obviously no cancellation
      will happen.

    The workflow should always inspect the state of the requests after the cancellation attempts
    to make sure that the requests were indeed cancelled.

There are two services in this group. The first one allows cancelling a single request by its identifier.
The second service is meant for cancelling all asynchronous requests of the given transaction.

.. _ingest-worker-contrib-cancel-one:

One request
-----------

The service allows cancelling an *asynchronous* contribution request:

.. list-table::
   :widths: 10 90
   :header-rows: 0

   * - ``DELETE``
     - ``/ingest/file-async/:id``

The service expects a caller to provide a unique identifier ``id`` of the contribution request in the resource path.
Values of the identifiers are returned by services that accept the contribution requests.

If the identifier is valid and the service could locate the desired record for the contribution it will make an attempt
to cancel it. The service will return the following JSON object:

.. code-block::

    { "contrib": {
        ...
      }
    }

See the :ref:`ingest-worker-contrib-descriptor` section of the document for the details on the schema of the response object.

.. _ingest-worker-contrib-cancel-trans:

All requests of a transaction
-----------------------------

The service allows cancelling all *asynchronous* contribution requests submitted in the scope of
a given transaction:

.. list-table::
   :widths: 10 90
   :header-rows: 0

   * - ``DELETE``
     - ``/ingest/file-async/trans/:id``

The service expects a caller to provide a unique identifier ``id`` of the corresponding transaction in the resource path.
Values of the transaction identifiers are returned by services that manage transactions. See :ref:`ingest-trans-management`
for more details.

If the identifier is valid and the service could locate the relevant contributions it will make an attempt
to cancel them. The service will return the following JSON array:

.. code-block::

    { "contribs": [
        <contrib>,
        ...
        <contrib>
      ]
    }

See the :ref:`ingest-worker-contrib-descriptor` section of the document for the details on the schema of the contribution objects.


.. _ingest-worker-contrib-descriptor:

Contribution descriptor
=======================

The following object illustrates the schema and a sample payload of the contribution descriptor:

.. 
code-block:: json + + { + "id" : 2651966, + "async" : 1, + "database" : "gaia_edr3", + "table" : "gaia_source", + "worker" : "db13", + "chunk" : 675, + "overlap" : 0, + "transaction_id" : 1630, + + "status" : "FINISHED", + "create_time" : 1726026383616, + "start_time" : 1726026383619, + "read_time" : 1726026396161, + "load_time" : 1726026412474, + + "url" : "http://sdfqserv001:18080/gaia_edr3/gaia_source/files/chunk_675.txt", + "http_method" : "GET", + "http_headers" : [], + "http_data" : "", + "tmp_file" : "/qserv/data/ingest/gaia_edr3-gaia_source-675-1630-7570-6e63-d0b6-6934.csv", + + "max_num_warnings" : 64, + "max_retries" : 4, + + "charset_name" : "latin1", + "dialect_input" : { + "fields_enclosed_by" : "\\0", + "lines_terminated_by" : "\\n", + "fields_escaped_by" : "\\\\", + "fields_terminated_by" : "," + }, + + "num_bytes" : 793031392, + "num_rows" : 776103, + "num_rows_loaded" : 776103, + + "http_error" : 0, + "error" : "", + "system_error" : 0, + "retry_allowed" : 0, + + "num_warnings" : 0, + "warnings" : [], + "num_failed_retries" : 0, + "failed_retries" : [] + } + + +The most important (for the ingest workflows) attributes of the contribution object are: + +``status`` : *string* + The status of the contribution requests. The possible values are: + + - ``IN_PROGRESS``: The transient state of a request before it's ``FINISHED`` or failed. + - ``CREATE_FAILED``: The request was received and rejected right away (incorrect parameters, etc.). + - ``START_FAILED``: The request couldn't start after being pulled from a queue due to changed conditions. + - ``READ_FAILED``: Reading/preprocessing of the input file failed. + - ``LOAD_FAILED``: Loading into MySQL failed. + - ``CANCELLED``: The request was explicitly cancelled by the ingest workflow (ASYNC contributions only). + - ``FINISHED``: The request succeeded, + +``create_time`` : *number* + The timestamp when the contribution request was received (milliseconds since the UNIX *Epoch*). + A value of the attribute is guaranteed to be not ``0``. + +``start_time`` : *number* + The timestamp when the contribution request was started (milliseconds since the UNIX *Epoch*). + A value of the attribute is ``0`` before the processing starts. + +``read_time`` : *number* + The timestamp when the Ingest service finished reading/preprocessing the input file (milliseconds since the UNIX *Epoch*). + A value of the attribute is ``0`` before the reading starts. + +``load_time`` : *number* + The timestamp when the Ingest service finished loading the contribution into the MySQL table (milliseconds since the UNIX *Epoch*). + A value of the attribute is ``0`` before the loading starts. + +``url`` : *string* + The URL of the input file that was used to create the contribution. Depending on a source of the data, + the URL *scheme* could be: + + - ``http``, ``https``: The file was pulled from a remote Web server. + - ``file``: The file was read from a filesystem that is mounted locally on the corresponding worker. The URL is a full path to the file. + - ``data-json``: The file was sent as a JSON object in the request body. The URL is a placeholder. + - ``data-csv``: The file was sent as a CSV file in the ``multipart/form-data`` formatted body. The URL is a placeholder. + + **Note** that there is no guarantee that the URL will be valid after the contribution is processed. + +``max_num_warnings`` : *number* + The maximum number of the MySQL warnings to be captured after loading the contribution into the MySQL table. 
    The number may correspond to a value that was explicitly set by the workflow when making a contribution request.
    Otherwise the default number configured in the system is assumed.

``max_retries`` : *number*
    The maximum number of retries allowed for the contribution. The number may correspond to a value that was explicitly
    set by the workflow when making a contribution request. Otherwise the default number configured in the system is assumed.

``num_bytes`` : *number*
    The total number of bytes in the input file. The value is set by the service after it finishes reading
    the file and before it starts loading the data into the MySQL table.

``num_rows`` : *number*
    The total number of rows parsed by the ingest service in the input file.

``num_rows_loaded`` : *number*
    The total number of rows loaded into the MySQL table. Normally the number of rows loaded should be equal to the number
    of rows parsed. If the numbers differ it means that some rows were rejected during the ingest process. The workflow
    should always monitor any mismatch between these values and trigger alerts.

``http_error`` : *number*
    The HTTP error code captured by the service when pulling data of the contribution from a remote Web server.
    This applies to the corresponding URL *schemes*. The value is set only if the error was detected.

``error`` : *string*
    The error message captured by the service during the contribution processing. The value is set only if the error was detected.

``system_error`` : *number*
    The system error code captured by the service during the contribution processing. The value is set only if the error was detected.

``retry_allowed`` : *number*
    The flag that tells if the contribution is allowed to be retried. The value is set by the service when the contribution
    processing fails. The value is set to ``1`` if the contribution is allowed to be retried, and to ``0`` otherwise.

    **Important**: The workflow should always analyze the value of this attribute to decide if the contribution should be
    retried. If the retry is not possible then the workflow should give up on the corresponding transaction, abort it, and
    start another transaction to re-ingest all contributions attempted in the scope of the aborted one.

``num_warnings`` : *number*
    The total number of MySQL warnings captured after loading the contribution into the MySQL table.

    **Note**: The number is reported correctly regardless of whether the array in the attribute ``warnings``
    is empty or not.

``warnings`` : *array*
    The array of MySQL warnings captured after loading the contribution into the MySQL table. Each entry is
    an object that represents a warning/error/note. See the table in :ref:`ingest-worker-contrib-descriptor-warnings`
    for the details on the schema of the object.

    **Note**: The maximum number of warnings captured is limited by the value of the attribute ``max_num_warnings``.

``num_failed_retries`` : *number*
    The total number of retries that failed during the contribution processing.

    **Note**: The number is reported correctly regardless of whether the array in the attribute ``failed_retries``
    is empty or not.

``failed_retries`` : *array*
    The array of failed retries captured during the contribution processing. Each such retry is represented
    by a JSON object that has a schema explained in :ref:`ingest-worker-contrib-descriptor-retries`.

    **Note**: The maximum number of failed retries captured is limited by the value of the attribute ``max_retries``.
.. _ingest-worker-contrib-descriptor-warnings:

MySQL warnings
--------------

Warnings are captured into the JSON array ``warnings``:

.. code-block::

    "warnings" : [
      <warning>,
      ...
      <warning>
    ]

The format of the object is presented below:

``level`` : *string*
    The severity of the warning reported by MySQL. Allowed values:

    - ``Note``
    - ``Warning``
    - ``Error``

``code`` : *number*
    The numeric error code indicating a reason for the observed problem.

``message`` : *string*
    The human-readable explanation for the problem.

Here is an example of what could be found in the array:

.. code-block:: json

    "warnings" : [
      {"code" : 1406, "level" : "Warning", "message" : "Data too long for column 's_region_scisql' at row 3670"},
      {"code" : 1261, "level" : "Warning", "message" : "Row 3670 doesn't contain data for all columns"},
      {"code" : 1406, "level" : "Warning", "message" : "Data too long for column 's_region_scisql' at row 3913"},
      {"code" : 1261, "level" : "Warning", "message" : "Row 3913 doesn't contain data for all columns"},
      {"code" : 1406, "level" : "Warning", "message" : "Data too long for column 's_region_scisql' at row 3918"},
      {"code" : 1261, "level" : "Warning", "message" : "Row 3918 doesn't contain data for all columns"}
    ]

More details on the values can be found in the MySQL documentation:

- https://dev.mysql.com/doc/refman/8.4/en/show-warnings.html

.. _ingest-worker-contrib-descriptor-retries:

Retries
-------

Retries are captured into the JSON array ``failed_retries``:

.. code-block::

    "failed_retries" : [
      <retry>,
      ...
      <retry>
    ]

The format of the object is presented below:

.. code-block::

    { "start_time" : <number>,
      "read_time" : <number>,
      "tmp_file" : <string>,
      "num_bytes" : <number>,
      "num_rows" : <number>,
      "http_error" : <number>,
      "system_error" : <number>,
      "error" : <string>
    }

Where:

``start_time`` : *number*
    The timestamp when the retry attempt was started (milliseconds since the UNIX *Epoch*).
    A value of the attribute is ``0`` before the processing starts.

``read_time`` : *number*
    The timestamp when the Ingest service finished reading/preprocessing the input file (milliseconds since the UNIX *Epoch*).
    A value of the attribute is ``0`` before the reading starts.

``num_bytes`` : *number*
    The total number of bytes in the input file. The value is set by the service after it finishes reading
    the file and before it starts loading the data into the MySQL table.

``num_rows`` : *number*
    The total number of rows parsed by the ingest service in the input file.

``http_error`` : *number*
    The HTTP error code captured by the service when pulling data of the contribution from a remote Web server.
    This applies to the corresponding URL *schemes*. The value is set only if the error was detected.

``system_error`` : *number*
    The system error code captured by the service during the contribution processing. The value is set only if the error was detected.

``error`` : *string*
    The error message captured by the service during the contribution processing. The value is set only if the error was detected.
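
To illustrate how the information above may be used, here is a minimal sketch of a workflow-side check of a contribution
descriptor returned by the status service, followed by an explicit retry via the ``PUT /ingest/file-async/:id`` service.
The worker endpoint and the request identifier are placeholders, and only attributes documented in this section are
inspected. Authorization and API version parameters that may be required by a particular deployment are omitted.

.. code-block:: python

    import requests

    # Placeholders: a worker Ingest Server endpoint and the identifier of a
    # previously submitted asynchronous contribution request.
    worker_url = "http://qserv-worker-0.qserv-worker.default.svc.cluster.local:25004"
    request_id = 2651966

    resp = requests.get(f"{worker_url}/ingest/file-async/{request_id}")
    resp.raise_for_status()
    contrib = resp.json()["contrib"]

    status = contrib["status"]
    if status == "FINISHED":
        # Even a successful load deserves a sanity check: a mismatch between the
        # parsed and loaded row counts or any MySQL warnings should be investigated.
        if contrib["num_rows_loaded"] != contrib["num_rows"] or contrib["num_warnings"] != 0:
            print("Contribution loaded with problems:", contrib["warnings"])
    elif status != "IN_PROGRESS":
        # The request failed. Retry it only if the service says a retry is possible;
        # otherwise the transaction has to be aborted and the contributions re-ingested
        # in the scope of another transaction.
        if contrib["retry_allowed"] == 1:
            retry = requests.put(f"{worker_url}/ingest/file-async/{request_id}")
            retry.raise_for_status()
            print("Retry queued, current status:", retry.json()["contrib"]["status"])
        else:
            print("Retry is not possible; the transaction should be aborted and restarted")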
diff --git a/doc/ingest/api/reference/tools.rst b/doc/ingest/api/reference/tools.rst
new file mode 100644
index 0000000000..3642fc5fc3
--- /dev/null
+++ b/doc/ingest/api/reference/tools.rst
@@ -0,0 +1,17 @@
######################
The Command Line Tools
######################

Error reporting in the command-line tools
=========================================

All command line tools return ``0`` to indicate the successful completion of the requested operation.
Other values shall be treated as errors. The error messages are printed to the standard error stream.
Additional information on the error can be found in the standard output stream.

.. _ingest-tools-qserv-replica-file:

qserv-replica-file
==================

TBC.
\ No newline at end of file
diff --git a/doc/ingest/api/simple/index.rst b/doc/ingest/api/simple/index.rst
new file mode 100644
index 0000000000..899ac2b46f
--- /dev/null
+++ b/doc/ingest/api/simple/index.rst
@@ -0,0 +1,648 @@

.. _ingest-api-simple:

=======================
Simple Workflow Example
=======================

This section provides a practical example of a simple workflow, demonstrating the core API interactions without relying on any automation or
wrapper scripts. Ensure you read the overview before proceeding to the example:

- :ref:`ingest-api-concepts-overview`

Test data
---------

The data used in this example is a small collection of ready-to-use ``CSV`` files and ``JSON`` configurations
located in the directory ``test101``. The directory is available in the Qserv repository at the following subfolder:

- `doc_datasets/test101 <https://github.com/lsst/qserv/tree/tickets/DM-46111/doc_datasets/test101>`_

The following configuration and data files are available in the directory:

.. code-block::

    database_test101.json

    table_Object.json
    table_Source.json
    table_Filter.json

    data/
      Filter.csv
      Object/
        chunk_7310_overlap.txt
        chunk_7480_overlap.txt
        chunk_7480.txt
      Source/
        chunk_7480.txt

The JSON files provide definitions for the database and tables:

``database_test101.json``
    The database ``test101`` to be created

``table_Object.json``
    The *director* table ``Object``

``table_Source.json``
    The simple *dependent* table ``Source``

``table_Filter.json``
    The *regular* (fully-replicated) table ``Filter``

Data files for these 3 tables are found within the subfolder ``data/``.

Note that the data is already partitioned and ready to be ingested into the Qserv instance. There are two chunks in
the dataset, ``7480`` and ``7310``, where the chunk ``7310`` contains only the chunk *overlap*. The table ``Source`` has
no overlaps since it is a dependent table. Only the *director* tables in Qserv can have overlaps.

Qserv setup
-----------

There are a few options for setting up the Qserv instance and the database. This example uses an existing Kubernetes-based Qserv
instance. For the sake of simplicity, all interactions with the Ingest API will be done using the ``curl`` command from inside
the Replication Controller pod.

.. code-block:: bash

    kubectl exec -it qserv-repl-ctl-0 -- /bin/bash

After logging into the pod, one can pull the files from the repository into the container:
.. code-block:: bash

    cd /home/qserv
    mkdir test101
    cd test101
    for file in database_test101.json table_Object.json table_Source.json table_Filter.json; do
        curl -O https://raw.githubusercontent.com/lsst/qserv/tickets/DM-46111/doc_datasets/test101/$file
    done
    mkdir data
    cd data
    curl -O https://raw.githubusercontent.com/lsst/qserv/tickets/DM-46111/doc_datasets/test101/data/Filter.csv
    mkdir Object
    cd Object
    for file in chunk_7310_overlap.txt chunk_7480_overlap.txt chunk_7480.txt; do
        curl -O https://raw.githubusercontent.com/lsst/qserv/tickets/DM-46111/doc_datasets/test101/data/Object/$file
    done
    cd ..
    mkdir Source
    cd Source
    curl -O https://raw.githubusercontent.com/lsst/qserv/tickets/DM-46111/doc_datasets/test101/data/Source/chunk_7480.txt
    cd ..
    cd ..

The next step is to ensure that the Replication Controller server is running. The server should respond to requests sent to
the following service:

.. code-block:: bash

    curl http://localhost:8080/meta/version

The response should be a JSON object explained in:

- :ref:`ingest-general-versioning`

Register the database and tables
--------------------------------

Register the database:

.. code-block:: bash

    curl http://localhost:8080/ingest/database -X POST -H "Content-Type: application/json" -d @database_test101.json

Register the tables:

.. code-block:: bash

    curl http://localhost:8080/ingest/table -X POST -H "Content-Type: application/json" -d @table_Object.json
    curl http://localhost:8080/ingest/table -X POST -H "Content-Type: application/json" -d @table_Source.json
    curl http://localhost:8080/ingest/table -X POST -H "Content-Type: application/json" -d @table_Filter.json

Start the transaction
---------------------

Start the transaction:

.. code-block:: bash

    curl http://localhost:8080/ingest/trans \
        -X POST -H "Content-Type: application/json" \
        -d '{"database": "test101","auth_key": ""}'

The response should be a JSON object with the transaction ID. The transaction ID is needed for all subsequent
requests (insignificant parts of the response are omitted):

.. code-block:: json

    { "success" : 1,
      "databases" : {
        "test101" : {
          "transactions" : [
            {
              "id" : 84,
              "state" : "STARTED",
              "begin_time" : 1730139963298,
              "start_time" : 1730139963367,
              "end_time" : 0
            }
          ]
        }
      }
    }

Get locations of the workers for ingesting the regular table Filter
----------------------------------------------------------------------

.. code-block:: bash

    curl http://localhost:8080/ingest/regular \
        -X GET -H "Content-Type: application/json" -d '{"transaction_id":84}'

The service returns a JSON object with the locations of the workers for ingesting the regular tables:

.. 
code-block:: json + + { "locations" : [ + + { "worker" : "qserv-worker-0", + "host" : "10.141.0.44", + "host_name" : "qserv-worker-0.qserv-worker.default.svc.cluster.local", + "port" : 25002, + "http_host_name" : "qserv-worker-0.qserv-worker.default.svc.cluster.local", + "http_host" : "10.141.0.44", + "http_port" : 25004 + }, + { "worker" : "qserv-worker-1", + "host" : "10.141.7.33", + "host_name" : "qserv-worker-1.qserv-worker.default.svc.cluster.local", + "port" : 25002, + "http_host" : "10.141.7.33", + "http_host_name" : "qserv-worker-1.qserv-worker.default.svc.cluster.local", + "http_port" : 25004 + }, + { "worker" : "qserv-worker-2", + "host" : "10.141.2.45", + "host_name" : "qserv-worker-2.qserv-worker.default.svc.cluster.local", + "port" : 25002, + "http_host" : "10.141.2.45", + "http_host_name" : "qserv-worker-2.qserv-worker.default.svc.cluster.local", + "http_port" : 25004 + }, + { "worker" : "qserv-worker-3", + "host" : "10.141.4.37", + "host_name" : "qserv-worker-3.qserv-worker.default.svc.cluster.local", + "port" : 25002, + "http_host" : "10.141.4.37", + "http_host_name" : "qserv-worker-3.qserv-worker.default.svc.cluster.local", + "http_port" : 25004 + }, + { "worker" : "qserv-worker-4", + "host" : "10.141.6.37", + "host_name" : "qserv-worker-4.qserv-worker.default.svc.cluster.local", + "port" : 25002, + "http_host" : "10.141.6.37", + "http_host_name" : "qserv-worker-4.qserv-worker.default.svc.cluster.local", + "http_port" : 25004 + } + ] + } + +According to the response, the data of the *regular* table ``Filter`` have to be pushed to the folowing worker hosts (using +the FQDNs of the hosts): + +.. code-block: + + qserv-worker-0.qserv-worker.default.svc.cluster.local + qserv-worker-1.qserv-worker.default.svc.cluster.local + qserv-worker-2.qserv-worker.default.svc.cluster.local + qserv-worker-3.qserv-worker.default.svc.cluster.local + qserv-worker-4.qserv-worker.default.svc.cluster.local + +Where the port numbers are: + +- ``25002`` for the binary protocol +- ``25004`` for the HTTP protocol + +The next section will be presenting examples for ingesting the data using both protocols. + +Get locations of the chunks 7310 and 7480 +----------------------------------------- + +For chunk ``7310``: + +.. code-block:: bash + + curl http://localhost:8080/ingest/chunk \ + -X POST -H "Content-Type: application/json" -d '{"transaction_id":84,"chunk":7310,"auth_key":""}' + +.. code-block:: json + + { "location" : { + "worker" : "qserv-worker-2", + "host" : "10.141.2.45", + "host_name" : "qserv-worker-2.qserv-worker.default.svc.cluster.local", + "port" : 25002, + "http_host" : "10.141.2.45", + "http_host_name" : "qserv-worker-2.qserv-worker.default.svc.cluster.local", + "http_port" : 25004 + } + } + +For chunk ``7480``: + +.. code-block:: bash + + curl http://localhost:8080/ingest/chunk \ + -X POST -H "Content-Type: application/json" -d '{"transaction_id":84,"chunk":7480,"auth_key":""}' + +.. 
code-block:: json

    { "location" : {
        "worker" : "qserv-worker-3",
        "host" : "10.141.4.37",
        "host_name" : "qserv-worker-3.qserv-worker.default.svc.cluster.local",
        "port" : 25002,
        "http_host" : "10.141.4.37",
        "http_host_name" : "qserv-worker-3.qserv-worker.default.svc.cluster.local",
        "http_port" : 25004
      }
    }

The following map shows the endpoints for ingesting the chunks over the proprietary binary protocol:

- ``7310``: ``qserv-worker-2.qserv-worker.default.svc.cluster.local:25002``
- ``7480``: ``qserv-worker-3.qserv-worker.default.svc.cluster.local:25002``

The endpoints for the HTTP-based protocol are presented below:

- ``7310``: ``qserv-worker-2.qserv-worker.default.svc.cluster.local:25004``
- ``7480``: ``qserv-worker-3.qserv-worker.default.svc.cluster.local:25004``


.. hint::

    Both the ``host`` (``http_host``) and ``host_name`` (``http_host_name``) addresses are provided in the response.
    The former is the IP address. The latter is the fully qualified domain name (FQDN) of the worker host.
    It's recommended to use the FQDN in the Kubernetes-based Qserv deployment where IP addresses of the worker
    pods are not stable. This suggestion applies to both the binary and HTTP protocols, and to any table type.

Ingest the data
---------------

Two alternative options for ingesting the data are presented in this section. The first option is to ingest the data
via the proprietary binary protocol using the following tool:

- :ref:`ingest-tools-qserv-replica-file` (TOOLS)

The second technique is to push the data via the HTTP protocol using the following service:

- :ref:`ingest-worker-contrib-by-val` (REST)

Ingest the data using the binary protocol
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The following command will ingest the data of the *regular* table ``Filter`` using the binary protocol:

.. code-block:: bash

    PORT=25002
    TRANS=84
    TABLE_TYPE="R"

    mkdir -p logs

    for idx in $(seq 0 4); do
        WORKER_HOST="qserv-worker-${idx}.qserv-worker.default.svc.cluster.local";
        qserv-replica-file INGEST FILE \
            ${WORKER_HOST} \
            ${PORT} \
            ${TRANS} \
            "Filter" \
            ${TABLE_TYPE} \
            data/Filter.csv \
            --verbose >& logs/Filter_${idx}.log;
    done

Note the flag ``--verbose`` which will print the summary of the ingestion request. The logs will be saved in
the directory ``logs/``. Here is an example of the output found in ``logs/Filter_0.log``:

.. code-block::

    Id: 1728535
    Ingest service location: qserv-worker-0.qserv-worker.default.svc.cluster.local:25002
    Transaction identifier: 84
    Destination table: Filter
    Chunk: 0
    Is chunk overlap: 0
    Input file name: data/Filter.csv
    Start time: 2024-10-28 20:14:55.922
    Finish time: 2024-10-28 20:14:55.945
    Elapsed time: 0 sec
    Bytes sent: 75
    MByte/sec: -nan
    Number of warnings: 0
    Number of rows parsed: 9
    Number of rows loaded: 9

Now ingest the data of the *partitioned* tables ``Object`` and ``Source``:
.. code-block:: bash

    PORT=25002
    TRANS=84
    TABLE_TYPE="P"

    qserv-replica-file INGEST FILE \
        "qserv-worker-2.qserv-worker.default.svc.cluster.local" \
        ${PORT} \
        ${TRANS} \
        "Object" \
        ${TABLE_TYPE} \
        data/Object/chunk_7310_overlap.txt \
        --fields-terminated-by=',' \
        --verbose >& logs/Object_chunk_7310_overlap.log

    qserv-replica-file INGEST FILE \
        "qserv-worker-3.qserv-worker.default.svc.cluster.local" \
        ${PORT} \
        ${TRANS} \
        "Object" \
        ${TABLE_TYPE} \
        data/Object/chunk_7480_overlap.txt \
        --fields-terminated-by=',' \
        --verbose >& logs/Object_chunk_7480_overlap.log

    qserv-replica-file INGEST FILE \
        "qserv-worker-3.qserv-worker.default.svc.cluster.local" \
        ${PORT} \
        ${TRANS} \
        "Object" \
        ${TABLE_TYPE} \
        data/Object/chunk_7480.txt \
        --fields-terminated-by=',' \
        --verbose >& logs/Object_chunk_7480.log

    qserv-replica-file INGEST FILE \
        "qserv-worker-3.qserv-worker.default.svc.cluster.local" \
        ${PORT} \
        ${TRANS} \
        "Source" \
        ${TABLE_TYPE} \
        data/Source/chunk_7480.txt \
        --fields-terminated-by=',' \
        --verbose >& logs/Source_chunk_7480.log

Push the data to workers via the HTTP protocol
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. hint::

    The ``curl`` command is used to push the data to the workers. The worker services return responses in JSON format.
    In the examples presented below, the response objects are stored in files with the extension ``.json``.
    The corresponding option is ``-o logs/<name>.json``. Always evaluate the response object to ensure the operation
    was successful.


The following command will ingest the data of the *regular* table ``Filter`` using the HTTP protocol:

.. code-block:: bash

    mkdir -p logs

    for idx in $(seq 0 4); do
        WORKER_HOST="qserv-worker-${idx}.qserv-worker.default.svc.cluster.local";

        curl http://${WORKER_HOST}:25004/ingest/csv \
            -X POST -H 'Content-Type: multipart/form-data' \
            -F 'transaction_id=84' \
            -F 'table=Filter' \
            -F 'file=@data/Filter.csv' \
            -o logs/Filter_${idx}.json \
            >& logs/Filter_${idx}.log;
    done

Now ingest the data of the *partitioned* tables ``Object`` and ``Source``:
.. code-block:: bash

    curl http://qserv-worker-2.qserv-worker.default.svc.cluster.local:25004/ingest/csv \
        -X POST -H 'Content-Type: multipart/form-data' \
        -F 'transaction_id=84' \
        -F 'table=Object' \
        -F 'chunk=7310' \
        -F 'overlap=1' \
        -F 'fields_terminated_by=,' \
        -F 'file=@data/Object/chunk_7310_overlap.txt' \
        -o logs/Object_chunk_7310_overlap.json \
        >& logs/Object_chunk_7310_overlap.log

    curl http://qserv-worker-3.qserv-worker.default.svc.cluster.local:25004/ingest/csv \
        -X POST -H 'Content-Type: multipart/form-data' \
        -F 'transaction_id=84' \
        -F 'table=Object' \
        -F 'chunk=7480' \
        -F 'overlap=1' \
        -F 'fields_terminated_by=,' \
        -F 'file=@data/Object/chunk_7480_overlap.txt' \
        -o logs/Object_chunk_7480_overlap.json \
        >& logs/Object_chunk_7480_overlap.log

    curl http://qserv-worker-3.qserv-worker.default.svc.cluster.local:25004/ingest/csv \
        -X POST -H 'Content-Type: multipart/form-data' \
        -F 'transaction_id=84' \
        -F 'table=Object' \
        -F 'chunk=7480' \
        -F 'overlap=0' \
        -F 'fields_terminated_by=,' \
        -F 'file=@data/Object/chunk_7480.txt' \
        -o logs/Object_chunk_7480.json \
        >& logs/Object_chunk_7480.log

    curl http://qserv-worker-3.qserv-worker.default.svc.cluster.local:25004/ingest/csv \
        -X POST -H 'Content-Type: multipart/form-data' \
        -F 'transaction_id=84' \
        -F 'table=Source' \
        -F 'chunk=7480' \
        -F 'overlap=0' \
        -F 'fields_terminated_by=,' \
        -F 'file=@data/Source/chunk_7480.txt' \
        -o logs/Source_chunk_7480.json \
        >& logs/Source_chunk_7480.log

Note that the last 4 commands are overriding the default field terminator ``\t`` with the comma ``','``.

Commit the transaction
----------------------

.. code-block:: bash

    curl 'http://localhost:8080/ingest/trans/84?abort=0' \
        -X PUT -H "Content-Type: application/json" \
        -d '{"auth_key": ""}'

This is a synchronous operation. The response will be a JSON object with the status of the operation. If the response object
contains the key ``success`` with the value ``1``, the operation was successful. The workflow may also check the status of
the transaction by making the following request:

.. code-block:: bash

    curl 'http://localhost:8080/ingest/database/test101' \
        -X GET -H "Content-Type: application/json"

The response object will contain the status of the transaction. If the transaction is in the ``FINISHED`` state, the
transaction was successful:

.. code-block:: json

    { "databases" : {
        "test101" : {
          "is_published" : 0,
          "num_chunks" : 2,
          "transactions" : [
            {
              "id" : 84,
              "database" : "test101",
              "state" : "FINISHED",
              "begin_time" : 1730139963298,
              "start_time" : 1730139963367,
              "end_time" : 1730156228946,
              "transition_time" : 1730156228374,
              "context" : {},
              "log" : []
            }
          ]
        }
      },
      "success" : 1,
      "error" : "",
      "error_ext" : {},
      "warning" : "No version number was provided in the request's query."
    }

Publish the database
---------------------

.. code-block:: bash

    curl 'http://localhost:8080/ingest/database/test101' \
        -X PUT -H "Content-Type: application/json" \
        -d '{"auth_key": ""}'

This is a synchronous operation. The response will be a JSON object with the status of the operation (truncated for brevity
to the key ``success``):

.. code-block:: json

    { "success" : 1
    }

The database is now published and ready for queries.

Test the catalog
----------------

This can be done by running a few simple queries via the ``mysql`` client:

.. 
code-block:: bash + + kubectl exec -it qserv-czar-0 -c proxy -- \ + mysql --protocol=tcp -hlocalhost -P4040 -uqsmaster test101 + +This will open a MySQL client connected to the database ``test101``. The following queries can be run to test the catalog: + +.. code-block:: sql + + SELECT * FROM Filter; + +----------------+----------+------------+----------+--------+ + | qserv_trans_id | filterId | filterName | photClam | photBW | + +----------------+----------+------------+----------+--------+ + | 84 | 0 | u | 0 | 0 | + | 84 | 1 | g | 0 | 0 | + | 84 | 2 | r | 0 | 0 | + | 84 | 3 | i | 0 | 0 | + | 84 | 4 | z | 0 | 0 | + | 84 | 5 | y | 0 | 0 | + | 84 | 6 | w | 0 | 0 | + | 84 | 7 | V | 0 | 0 | + +----------------+----------+------------+----------+--------+ + +.. code-block:: sql + + SELECT COUNT(*) FROM Object; + +----------+ + | COUNT(*) | + +----------+ + | 1000 | + +----------+ + +.. code-block:: sql + + SELECT COUNT(*) FROM Source; + +----------+ + | COUNT(*) | + +----------+ + | 4583 | + +----------+ + +.. code-block:: + + SELECT * FROM Object LIMIT 1\G + *************************** 1. row *************************** + qserv_trans_id: 84 + objectId: 433327840428745 + iauId: NULL + ra_PS: 1.30450574307 + ra_PS_Sigma: 0.0000153903 + decl_PS: 3.34239540723 + decl_PS_Sigma: 0.0000166903 + radecl_PS_Cov: 0.00000000162187 + ra_SG: 1.30451383451 + ra_SG_Sigma: 0.000135688 + decl_SG: 3.34239574427 + decl_SG_Sigma: 0.000145373 + radecl_SG_Cov: -0.00000000107427 + raRange: NULL + declRange: NULL + ... + +**Note** the MySQL-specific syntax for the query ``\G``. The ``\G`` is a MySQL-specific command that formats the output +of the query in a more readable way. The output is presented in a vertical format, where each row is presented on a separate +line. The columns are presented in the format ``column_name: value``. + +.. code-block:: sql + + SELECT objectId FROM Object LIMIT 10; + +-----------------+ + | objectId | + +-----------------+ + | 433327840428745 | + | 433327840428744 | + | 433327840428743 | + | 433327840428742 | + | 433327840428741 | + | 433327840428740 | + | 433327840428739 | + | 433327840428746 | + | 433327840428747 | + | 433327840428748 | + +-----------------+ + 10 rows in set (0.07 sec) + +.. code-block:: sql + + SELECT objectId,decl_PS,ra_PS FROM Object WHERE objectId=433327840428739; + +-----------------+---------------+---------------+ + | objectId | decl_PS | ra_PS | + +-----------------+---------------+---------------+ + | 433327840428739 | 3.33619102281 | 1.29801680549 | + +-----------------+---------------+---------------+ + +Post-ingest operations +---------------------- + +The database is now ready for queries. However, the following operations can be performed: + +- :ref:`ingest-api-post-ingest` + diff --git a/doc/ingest/data/index.rst b/doc/ingest/data/index.rst new file mode 100644 index 0000000000..87b95bc924 --- /dev/null +++ b/doc/ingest/data/index.rst @@ -0,0 +1,13 @@ + +.. _ingest-data: + +Data preparation +================ + +.. toctree:: + :maxdepth: 2 + + intro + partitioning + partitioner + ref-match diff --git a/doc/ingest/data/intro.rst b/doc/ingest/data/intro.rst new file mode 100644 index 0000000000..1aeb993d1d --- /dev/null +++ b/doc/ingest/data/intro.rst @@ -0,0 +1,109 @@ +.. _ingest-data-intro: + +Introduction +============ + +This document presents the requirements for data to be ingested into Qserv and describes the process of preparing +data for ingestion. + +The main data format supported by Qserv is ``CSV``. 
This choice is motivated by the fact that Qserv is implemented on top of MySQL,
which natively supports the ``CSV`` format. The ``CSV`` format is also very simple and can be easily generated from other
formats like ``Parquet``. Another reason for choosing the ``CSV`` format is the performance of the ``LOAD DATA INFILE`` statement
used to load data into MySQL. The ``LOAD DATA INFILE`` statement is the fastest way to load data into MySQL and is optimized for
the ``CSV`` format. More information on this subject can be found in:

- `LOAD DATA INFILE <https://dev.mysql.com/doc/refman/8.4/en/load-data.html>`_ (MySQL)

The second option for loading data into Qserv is to send the data packaged in the ``JSON`` format. This option is not as efficient as the
``CSV`` format but is more flexible and can be used for loading data into Qserv in a more complex way. The ``JSON`` format is supported by
the following service:

- :ref:`ingest-worker-contrib-by-val-json`

This format is also used by the simple ingest API documented in :ref:`http-frontend`. That API is meant for ingesting and managing user
tables which are relatively small in size.

The data preparation process includes 4 steps:

1. Data conversion
2. Partitioning
3. Post-processing (data sanitization)
4. Staging

Some steps may be optional depending on the data format and the requirements of the ingest workflow.

Data conversion
---------------

.. note::

    This section is not complete. More details on the data conversion process will be added in a future
    version of the document.

The data conversion step is required if the input data is not in the ``CSV`` format. The most common data format
used in the LSST project is the ``Parquet`` format.

Partitioning
------------

This topic is covered in:

- :ref:`ingest-data-partitioning` (DATA)
- :ref:`ingest-data-partitioner` (DATA)
- :ref:`ingest-data-partitioning-ref-match` (DATA)


Post-processing
---------------

Besides converting the data to the CSV format, there are other operations that may optionally be performed on
the input data. The purpose of these operations is to ensure the values of the columns are compatible with
MySQL expectations. Here are a few examples of what may need to be done:

- The ``BOOL`` type maps to the ``TINYINT`` type in MySQL. Because of that, values like ``true``
  and ``false`` are not supported by MySQL. Hence, they need to be converted to ``1`` and ``0`` respectively.

- Some data tools may produce ``-inf`` and ``+inf`` values when converting floating point numbers
  into the ``CSV`` format. Neither of these values is supported by MySQL. Assuming the column type is ``REAL``,
  they need to be converted to ``-1.7976931348623157E+308`` and ``1.7976931348623157E+308`` respectively.

- The ``NULL`` values in the ``CSV`` files need to be converted into ``\N``.

Handling the binary data
------------------------

The binary data is supported by the Qserv ingest system in two ways:

- The ``CSV`` format supports the binary data. The corresponding fields need to be properly escaped
  as explained in:

  - `LOAD DATA INFILE <https://dev.mysql.com/doc/refman/8.4/en/load-data.html>`_ (MySQL)

- The ``JSON`` format also supports the binary data. However, the data in the corresponding columns need
  to be encoded as explained in:

  - :ref:`ingest-general-binary-encoding` (API)


Restrictions for the variable-length column types
---------------------------------------------------

Note that variable-length column types like ``VARCHAR`` and ``TEXT`` are not allowed in the *director* tables in Qserv.
+This is because *director* tables are used to materialize sub-chunks of the data. Sub-chunks are the smallest units of +data that can be processed by Qserv workers. The sub-chunk tables are implemented using the ``MEMORY`` storage engine. +Further details on this subject can be found in: + +- `MEMORY Storage Engine `_ (MySQL) + +Staging +------- + +Once the data are converted and partitioned, they need to be staged at a location from where they can be loaded into Qserv. +Depending on the selected ingest method, the data may be: + +- placed locally, from where they would be pushed into Qserv via the proprietary binary protocol or the REST API. +- placed on a distributed filesystem like ``GPFS``, ``Lustre``, etc., which is mounted at the Qserv workers. +- placed on a Web server, from where they could be pulled into Qserv via the HTTP/HTTPS protocol. +- placed into an Object Store (S3 compatible), from where they could be pulled into Qserv via the S3 protocol. + +Besides availability, the workflow may also require the data to be retained until the ingest process is completed. diff --git a/doc/ingest/data/partitioner.rst b/doc/ingest/data/partitioner.rst new file mode 100644 index 0000000000..037be78084 --- /dev/null +++ b/doc/ingest/data/partitioner.rst @@ -0,0 +1,171 @@ +.. _ingest-data-partitioner: + +Partitioner +=========== + +Summary +------- + +Qserv's partitioners are implemented as standalone C++ programs +(``sph-partition`` or ``sph-partition-matches``) that separate input rows from one +or more files and write them into another set of files named according to their +partition numbers. + +``sph-partition[-matches]`` takes as input a set of CSV files, and based on a +specified partition configuration, produces output CSV files (one per partition). +Each partition is called a "chunk" and contains subpartitions ("subchunks"), +although a partition's subpartitions are stored together in a file (and table). + +Tables that are partitioned in Qserv must be partitioned identically within a +Qserv database. + +This means that tables in a database share identical partition +boundaries and identical mappings of ``chunk ID spatial partition``. In order to +facilitate table joining, a single table's columns are chosen to define the +partitioning space and all partitioned tables (within a database) are +partitioned according that pair of columns. Our current plan chooses the +``Object`` table's ``ra_PS`` and ``decl_PS`` columns, meaning that rows in the +``Source`` and ``ForcedSource`` tables would be partitioned according to the +``Object`` they reference. + +However, there is a wrinkle: prototype pipeline development involves comparing +pipeline outputs to a reference catalog, which might correspond to an +independent processing of the input data, or to a ground truth catalog from +which raw pipeline input images were synthesized. Such tables (``SimRefObject`` +for example) share the same partitioning parameters as ```Object```. But, there +is no obvious 1 to 1 mapping between reference objects and objects. Instead, +the pipelines compute a (spatial) match table between the two that provides +a many-to-many relationship between both tables. The complication is that a +match record might reference an object and reference object that fall on +opposite sides of a partition boundary. 
Qserv deals with this by taking +advantage of the overlap that must be stored alongside each partition (this +overlap is stored so that Qserv can avoid inter-worker communication when +performing spatial joins on the fly). + +Requirements +------------ + +The Qserv partitioning code must be capable of processing: + +- CSV-format files +- files that are very large +- large quantities of files +- tables containing both positions and positional match pairs + +It must be possible to reorder and drop input columns. The output format shall +be CSV suitable for loading into the database instances running on Qserv worker +nodes. + +The following features are desirable, but have yet to be implemented: + +- It should be possible to process FITS table files. + +- It should be possible to lookup the sky-positions of entities in a + record by key. Currently, positions must be present in the records fed to the + partitioner (but can be dropped from the output). Note that it must be + possible to fall-back to another position in case the key is ``NULL``. Such + orphaned entities can be produced if for example the association pipeline + doesn't necessarily associate each ``Source`` to an ``Object`` (some ``Source`` + rows might be considered to be spurious). + +- Producing partitioned output tables in native database format (e.g. + directly producing ``.MYD`` files for the MySQL MyISAM storage engine) + may significantly speed up the loading process. + +Design +------ + +The partitioner is expressed in the map-reduce paradigm. The map function +operates on one input record at a time. It computes the partition(s) +the input record must be assigned to and emits corresponding output +records. Output records are grouped by chunk and passed on to reducers, +which collect statistics on how many records are in each sub-chunk and +write output records to disk. This is all implemented in C++, using a small +in-memory map-reduce processing framework tailored to the task. The framework +is multi-threaded - file input, processing, and output are all block oriented +and parallel (see ``src/MapReduce.h`` in the source tree for details). + +As a result of using this model, the code that deals with parallelization is +separated from the partitioning logic, making the implementation easier to +follow. Another benefit is that porting to the Hadoop framework would at +least be conceptually straightforward. + +Usage +----- + +Here is a fully worked single node example for the PT1.2 ``Object``, +``SimRefObject``, and ``RefObjMatch`` tables. First, the tables are dumped +in TSV format: + +.. code-block:: bash + + mysql -A \ + -h lsst10.ncsa.illinois.edu \ + -D rplante_PT1_2_u_pt12prod_im2000 \ + -B --quick --disable-column-names \ + -e "SELECT * FROM Object;" > Object.tsv + + mysql -A \ + -h lsst10.ncsa.illinois.edu \ + -D rplante_PT1_2_u_pt12prod_im2000 \ + -B --quick --disable-column-names \ + -e "SELECT * FROM SimRefObject;" > SimRefObject.tsv + +For the`RefObjMatch` table, we'll need to pull in the positions of the objects +and reference objects being matched up, so that the partitioner can determine +which partition each side of the match has been assigned to. That can be done via: + +.. code-block:: sql + + SELECT m.*, r.ra, r.decl, o.ra_PS, o.decl_PS + FROM RefObjMatch AS m LEFT JOIN + SimRefObject AS r ON (m.refObjectId = r.refObjectId) LEFT JOIN + Object AS o ON (m.objectId = o.objectId); + +Note the left joins - either side of a match pair can be NULL, indicating an +unmatched object or reference object. 
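
For example, the join can be run through the same ``mysql`` client invocation used for the dumps above to
produce a ``RefObjMatch.tsv`` file (a sketch only; adjust the connection parameters to your environment):

.. code-block:: bash

    mysql -A \
        -h lsst10.ncsa.illinois.edu \
        -D rplante_PT1_2_u_pt12prod_im2000 \
        -B --quick --disable-column-names \
        -e "SELECT m.*, r.ra, r.decl, o.ra_PS, o.decl_PS
            FROM RefObjMatch AS m LEFT JOIN
                 SimRefObject AS r ON (m.refObjectId = r.refObjectId) LEFT JOIN
                 Object AS o ON (m.objectId = o.objectId);" > RefObjMatch.tsv
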
+ +Assuming ``QSERV_DIR`` has been set to a Qserv install or checkout location, +the ``Source`` and ``Object`` tables can now be partitioned as follows: + +.. code-block:: bash + + CFG_DIR=$QSERV_DIR/admin/dupr/config/PT1.2 + + for TABLE in Object Source; do + sph-partition \ + --config-file=$CFG_DIR/$TABLE.json \ + --config-file=$CFG_DIR/common.json \ + --in.csv.null=NULL \ + --in.csv.delimiter=$'\t' \ + --in.csv.escape=\\ \ + --in.csv.quote=\" \ + --in.path=$TABLE.tsv \ + --verbose \ + --mr.num-workers=6 --mr.pool-size=32768 --mr.block-size=16 \ + --out.dir=chunks/$TABLE + done + + +The matches can be partitioned using: + +.. code-block:: bash + + sph-partition-matches \ + --config-file=$CFG_DIR/RefObjMatch.json \ + --config-file=$CFG_DIR/common.json \ + --in.csv.null=NULL \ + --in.csv.delimiter=$'\t' \ + --in.csv.escape=\\ \ + --in.csv.quote=\" \ + --in.path=RefObjMatch.tsv \ + --verbose \ + --mr.num-workers=6 --mr.pool-size=32768 --mr.block-size=16 \ + --out.num-nodes=1 --out.dir=chunks/RefObjMatch + +Output chunk files are stored in the directory specified by ```--out.dir```, +and can subsequently be distributed and loaded into the MySQL databases +on worker nodes. Examine the config files referenced above and run +``sph-partition --help`` or ```sph-partition-matches --help``` for more +information on partitioning parameters. + diff --git a/doc/ingest/data/partitioning.rst b/doc/ingest/data/partitioning.rst new file mode 100644 index 0000000000..696f313b1e --- /dev/null +++ b/doc/ingest/data/partitioning.rst @@ -0,0 +1,204 @@ +.. _ingest-data-partitioning: + +Partitioning +============ + +Overview +-------- + +The largest LSST tables will not fit on a single node in the projected +production time-frame, and so must be partitioned into smaller pieces +and spread over many nodes. Qserv uses a spatial partitioning scheme. A +dominant table is chosen containing sky coordinates that are used for +partitioning; for LSST data releases, this is the ``Object`` table. All other +tables that contain a foreign key into the ``Object`` table are partitioned by +the position of the associated ``Object``, or by a fall-back position if an +associated ``Object`` is not guaranteed to have been assigned. This means for +example that all the single-exposure sources associated with a particular +``Object`` will end up in the same partition, even if that ``Object`` has +non-trivial proper motion. Large tables (e.g. catalogs from other surveys) +that are not directly related to ``Object`` are also partitioned by position, +using the same scheme and parameters. This results in perfectly aligned +partition boundaries, simplifying the implementation of near-neighbor joins. +The following excerpt from +`Qserv: A distributed shared-nothing database for the LSST catalog `_ +further describes some of the motivations for partitioning data in Qserv: + + Qserv divides data into spatial partitions of roughly the same area. Since + objects occur at a similar density (within an order of magnitude) throughout + the celestial sphere, equal-area partitions should evenly spread a load that + is uniformly distributed over the sky. If partitions are small with respect + to higher-density areas and spread over computational resources in a + non-area-based scheme, density-differential-induced skew will be spread among + multiple nodes. + + Determining the size and number of data partitions may not be obvious. 
+ Queries are fragmented according to partitions so an increasing number of + partitions increases the number of physical queries to be dispatched, + managed, and aggregated. Thus a greater number of partitions increases the + potential for parallelism but also increases the overhead. For a + data-intensive and bandwidth-limited query, a parallelization width close + to the number of disk spindles should minimize seeks while maximizing + bandwidth and performance. + + From a management perspective, more partitions facilitate re-balancing data + among nodes when nodes are added or removed. If the number of partitions + were equal to the number of nodes, then the addition of a new node would + require the data to be re-partitioned. On the other hand, if there were many + more partitions than nodes, then a set of partitions could be assigned to + the new node without re-computing partition boundaries. + + Smaller and more numerous partitions benefit spatial joins. In an + astronomical context, we are interested in objects near other objects, + and thus a full O(n^2) join is not required — a localized spatial join + is more appropriate. With spatial data split into smaller partitions, + a SQL engine computing the join need not even consider (and reject) all + possible pairs of objects, merely all the pairs within a region. Thus a + task that is naively O(n^2) becomes O(kn) where k is the number of objects + in a partition. + + In consideration of these trade-offs, two-level partitioning seems to be a + conceptually simple way to blend the advantages of both extremes. Queries + can be fragmented in terms of coarse partitions (“chunks”), and spatial + near-neighbor joins can be executed over more fine partitions (“subchunks”) + within each partition. To avoid the overhead of the subchunks for non-join + queries, the system can store chunks and generate subchunks on-demand for + spatial join queries. On-the-fly generation for joins is cost-effective + due to the drastic reduction of pairs, which is true as long as there are + many subchunks for each chunk. + +Note that if each point is assigned to exactly one partition, then a spatial join +cannot operate on a single partition at a time because two nearby points could +belong to different partitions. Dealing with outside partitions requires +data exchange, complicating the Qserv implementation. To avoid this, Qserv +stores a precomputed amount of overlapping data along with each partition. + +The Partitioning Scheme +----------------------- + +The celestial sphere is divided into latitude angle “stripes” of fixed height H. +For each stripe, a width W is computed such that any two points in the stripe +with longitude angles separated by at least W have angular separation of +at least H. The stripe is then broken into an integral number of chunks of +width at least W. The same procedure is used to obtain finer subchunks - +each stripe is broken into a configureable number of equal-height “substripes”, +and each substripe is broken into equal-width subchunks. This scheme was chosen +over e.g. the Hierarchical Triangular Mesh for its speed (no trigonometry is +required to locate the partition of a point given in spherical coordinates), +simplicity of implementation, and the fine control it offers over the area of +chunks and sub-chunks. + +Partition Identifiers +--------------------- + +Partition identifiers are integers. No assumptions about the properties +of these IDs (e.g. consecutiveness, non-negativity, etc... 
) should be made +outside of the partitioning code itself. For the record, here is how these +identifiers are currently assigned. + +First, stripes are numbered from 0 to S - 1, where 0 denotes the southernmost +stripe. Within a stripe s, chunks are numbered from 0 to C_s - 1, where 0 +denotes the chunk with minimum longitude = 0. Note that according to the rules +above, 2S >= C_s for any s. The identifier (chunk ID) for chunk c in stripe s +is given by: + + 2S*s + c + +Similarly, the identifier (subchunk ID) for subchunk c in substripe s within a +particular chunk is: + + M*s + c + +where M is the maximum number of sub-chunks that fit in a single sub-stripe of +any chunk. + +Overlap +------- + +The boundaries of chunks and subchunks constructed as described are boxes in +longitude and latitude angle. It is relatively straightforward to compute +a spherical box that is a conservative approximation to the region containing +all points within angular separation R of an initial box - the latitude +limits must be adjusted outwards by R, and longitude bounds for the +shape obtained by sweeping a small circle of opening angle R along the sides +of the box must be computed. + +.. image:: /_static/subchunks.png + :target: ../../../_images/subchunks.png + :alt: Subchunks + +In the image above, the white square on the right is a subchunk. The +overlap region of this subchunk consists of the light blue +region around it. On the left, a tiling of multiple subchunks +is shown. + +Qserv implements a spatial join that finds objects in two distinct tables +U and V separated by an angle of at most R by taking the union of the +following over all subchunks: + +.. code-block:: sql + + ( + SELECT ... + FROM U_p, V_p + WHERE scisql_angSep(U_p.ra, U_p.decl, V_p.ra, V_p.decl) <= R AND ... + ) UNION ALL ( + SELECT ... + FROM U_p, OV_p + WHERE scisql_angSep(U_p.ra, U_p.decl, V_p.ra, V_p.decl) <= R AND ... + ) + +Here, U_p and V_p correspond to the subchunk p of U and V (which must be +identically partitioned), and OV_p contains the points inside the overlap +region of p. + +Match Tables +------------ + +Match tables store a precomputed many-to-many relationship between two +identically partitioned tables U and V. For example, a match table might +store a mapping between reference objects and objects that facilitates +data production QA. + +A row in a match table M consists of (at least) a pair of foreign keys +into the two related tables. A match in M is assigned to a subchunk +P if either of the referenced positions to is assigned to P. If no positions +in a match are separated by more than the overlap radius, then a 3-way +equi-join between U, M and V can be decomposed into the union of +3-way joins over the set of subchunks P: + +.. code-block:: sql + + ( + SELECT ... + FROM U_p INNER JOIN M_c ON (U_p.pk = M_c.fk_u) + INNER JOIN V_p ON (M_c.fk_v = V_p.pk) + WHERE ... + ) UNION ALL ( + SELECT ... + FROM U_p INNER JOIN M_c ON (U_p.pk = M_c.fk_u) + INNER JOIN OV_p ON (M_c.fk_v = OV_p.pk) + WHERE ... + ) + +U_p and V_p correspond to the subchunk p of chunk c of tables U and V. M_c +corresponds to chunk c of table M, and OV_p is the subset of V containing +points in the full overlap region of subchunk p. Note that queries which +involve only the match table need to be rewritten to discard duplicates, +since a match pair linking positions in two different chunks will be stored +twice (once in each chunk). 
+ +Object movement +--------------- + +The fact all tables will be partitioned according to ``Object`` RA/Dec implies +that in rare cases when an object is close to the partition edge, some of its +sources may end up in a partition that is different from the "natural" one for +the source's own ra/dec. To address this issue, the Qserv master will expand +the search region by an distance R when determining which partitions to query, +where R is large enough to capture object motion for the duration of the LSST +survey to date. The WHERE clause applied by Qserv workers to their ``Source`` +table partitions will use the original search region to ensure that the query +semantics are preserved. Without this, some sources inside the search region +assigned to objects outside the search region *and* that lie across partition +boundaries might be missed. diff --git a/doc/ingest/data/ref-match.rst b/doc/ingest/data/ref-match.rst new file mode 100644 index 0000000000..7065128fb6 --- /dev/null +++ b/doc/ingest/data/ref-match.rst @@ -0,0 +1,236 @@ +.. _ingest-data-partitioning-ref-match: + +Partitioning data of the ref-match tables +========================================= + +Introduction +------------ + +The *ref-match* tables are a special class of tables that are designed to match rows between two independent *director* tables +belonging to the same or different catalogs. In this case, there is no obvious 1-to-1 match between rows of the director tables. +Instead, the pipelines compute a (spatial) match table between the two that provides a many-to-many relationship between both tables. +The complication is that a match record might reference a row (an *object*) and reference row that fall on opposite sides of +the partition boundary (into different chunks). Qserv deals with this by taking advantage of the overlap that must be stored +alongside each partition (this overlap is stored so that Qserv can avoid inter-worker communication when performing +spatial joins on the fly). + +Since the *ref-match* tables are also partitioned tables the input data (CSV) of the tables have to be partitioned into chunks. +In order to partition the *ref-match* tables one would have to use a special version of the partitioning tool sph-partition-matches. +A source code of the tool is found in the source tree of Qserv: https://github.com/lsst/qserv/blob/main/src/partition/sph-partition-matches.cc. +The corresponding binary is built and placed into the binary Docker image of Qserv. + +Here is an example illustrating how to launch the tool from the container: + +.. code-block:: bash + + % docker run -it qserv/lite-qserv:2022.9.1-rc1 sph-partition-matches --help + + sph-partition-matches [options] + + The match partitioner partitions one or more input CSV files in + preparation for loading by database worker nodes. This involves assigning + both positions in a match pair to a location in a 2-level subdivision + scheme, where a location consists of a chunk and sub-chunk ID, and + outputting the match pair once for each distinct location. Match pairs + are bucket-sorted by chunk ID, resulting in chunk files that can then + be distributed to worker nodes for loading. + A partitioned data-set can be built-up incrementally by running the + partitioner with disjoint input file sets and the same output directory. + Beware - the output CSV format, partitioning parameters, and worker + node count MUST be identical between runs. Additionally, only one + partitioner process should write to a given output directory at a + time. 
If any of these conditions are not met, then the resulting + chunk files will be corrupt and/or useless. + \_____________________ Common: + -h [ --help ] Demystify program usage. + -v [ --verbose ] Chatty output. + -c [ --config-file ] arg The name of a configuration file + containing program option values in a + ... + +The tool has two parameters specifying the locations of the input (CSV) file and the output folder where +the partitioned products will be stored: + +.. code-block:: bash + + % sph-partition-matches --help + .. + \_____________________ Output: + --out.dir arg The directory to write output files to. + \______________________ Input: + -i [ --in.path ] arg An input file or directory name. If the + name identifies a directory, then all + the files and symbolic links to files + in the directory are treated as inputs. + This option must be specified at least + once. + +.. hint:: + + If the tool is launched via the docker command as was shown above, one would have to mount the corresponding + host paths into the container. + +All tables, including both *director* tables and the *ref-match* table itself, have to be partitioned using +the same values of the partitioning parameters, including: + +- The number of stripes +- The number of sub-stripes +- The overlap radius + +Values of the partitioning parameters should be specified using the following options (the default values shown below are meaningless +for any production scenario): + +.. code-block:: bash + + --part.num-stripes arg (=18) The number of latitude angle stripes to + divide the sky into. + --part.num-sub-stripes arg (=100) The number of sub-stripes to divide + each stripe into. + --part.overlap arg (=0.01) Chunk/sub-chunk overlap radius (deg). + +The next sections present two options for partitioning the input data. + +The spatial match within the given overlap radius +------------------------------------------------- + +This is the most reliable way of partitioning the input data of the match tables. It is available when +the input rows of the match table carry the exact spatial coordinates of both matched rows (from the corresponding +*director* tables). + +In this scenario, the input data file (``CSV``) is expected to have 4 columns representing the spatial coordinates +of the matched rows from the *director* tables on the 1st ("left") and on the 2nd ("righ"). Roles and sample names +of the columns are presented in the table below: + +``dir1_ra`` + The *right ascent* coordinate (*longitude*) of the 1st matched entity (from the 1st *director* table). +``dir1_dec`` + The *declination* coordinate (*latitude*) of the 1st matched entity (from the 1st director table). +``dir2_ra`` + The *right ascent* coordinate (*longitude*) of the 2nd matched entity (from the 2nd *director* table). +``dir2_dec`` + The *declination* coordinate (*latitude*) of the 2nd matched entity (from the 2nd director table). + +The names of these columns need to be passed to the partitioning tool using two special parameters: + +.. code-block:: bash + + % sph-partition-matches \ + --part.pos1="dir1_ra,dir1_dec" \ + --part.pos2="dir2_ra,dir2_dec" + +.. note: + + The order of the columns in each packed pair pf columns is important. The names must be separated by commas. + +When using this technique for partitioning the match tables, it's required that the input CSV file(s) had at least those 4 columns +mentioned above. The actual number of columns could be larger. Values of all additional will be copied into the partitioned +products (the chunk files). 
The original order of the columns will be preserved. + +Here is an example of a sample ``CSV`` file that has values of the above-described spatial coordinates in the first 4 columns +and the object identifiers of the corresponding rows from the matched *director* tables in the last 2 columns: + +.. code-block:: + + 10.101,43.021,10.101,43.021,123456,6788404040 + 10.101,43.021,10.102,43.023,123456,6788404041 + +The last two columns are meant to store values of the following columns: + +``dir1_objectId`` + The unique object identifier of the 1st *director* table. +``dir2_objectId`` + The unique object identifier of the 2nd *director* table. + +The input CSV file shown above could be also presented in the tabular format: + +.. list-table:: + :widths: 10 10 10 10 10 10 + :header-rows: 1 + + * - ``dir1_ra`` + - ``dir1_dec`` + - ``dir2_ra`` + - ``dir2_dec`` + - ``dir1_objectId`` + - ``dir2_objectId`` + * - 0.101 + - 43.021 + - 10.101 + - 43.021 + - 123456 + - 6788404040 + * - 0.101 + - 43.021 + - 10.102 + - 43.023 + - 123456 + - 6788404041 + +Note that this is actually a 1-to-2 match, in which a single object (``123456``) of the 1st director has two matched +objects (``6788404040`` and ``6788404041``) in the 2nd director. Also, note that the second matched object has slightly +different spatial coordinates than the first one. If the value of the overlap parameter is bigger than the difference +between the coordinates then the tool will be able to match the objects successfully. For example, this would work if +a value of the overlap was set to ``0.01``. Otherwise, no match will be made and the row will be ignored by the tool. + +.. _warning: + + It is assumed that the input data of the *ref-match* tables are correctly produced by the data processing + pipelines. Verifying the quality of the input data is beyond the scope of this document. However, one might + consider writing a special tool for pre-scanning the input files and finding problems in the files. + +Here is the complete practical example of how to run the tool with the assumptions made above: + +.. code-block:: bash + + % cat in.csv + 10.101,43.021,10.101,43.021,123456,6788404040 + 10.101,43.021,10.102,43.023,123456,6788404041 + + % cat config.json + { + "part":{ + "num-stripes":340. + "num-sub-stripes":3, + "overlap":0.01, + "pos1":"dir1_ra,dir1_dec", + "pos2":"dir2_ra,dir2_dec" + }, + "in":{ + "csv":{ + "null":"\\N", + "delimiter":",", + "field":[ + "dir1_ra", + "dir1_dec" + "dir2_ra", + "dir2_dec", + "dir1_objectId", + "dir2_objectId" + ] + } + }, + "out":{ + "csv":{ + "null":"\\N", + "delimiter":",", + "escape":"\\", + "no-quote":true + } + } + } + + % mkdir chunks + % sph-partition-matches -c config.json --in.path=in.csv --out.dir=chunks/ + +Partitioning using index maps +----------------------------- + +.. note:: + + This section is under construction. Only the basic idea is presented here. + +This is an alternative way of partitioning the input data of the match tables. It is available when the input rows of the match table +do not carry the exact spatial coordinates of both matched rows (from the corresponding *director* tables). Instead, the input data +has to carry the unique object identifiers of the matched rows. The tool will use the object identifiers to find the spatial coordinates +of the matched rows in the *director* tables. The tool will use the index maps of the *director* tables to find the spatial coordinates +of the matched rows. 
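
Under this scheme the input ``CSV`` would carry only the identifier columns. A hypothetical input
corresponding to the example above (the exact column layout expected by the tool is still to be documented)
might look like this:

.. code-block::

    123456,6788404040
    123456,6788404041
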
diff --git a/doc/ingest/index.rst b/doc/ingest/index.rst new file mode 100644 index 0000000000..35c5945541 --- /dev/null +++ b/doc/ingest/index.rst @@ -0,0 +1,14 @@ + +.. highlight:: sql + +################## +Ingesting catalogs +################## + +.. toctree:: + :maxdepth: 2 + + intro + api/index + qserv-ingest/index + data/index diff --git a/doc/ingest/intro.rst b/doc/ingest/intro.rst new file mode 100644 index 0000000000..c7dd9c830a --- /dev/null +++ b/doc/ingest/intro.rst @@ -0,0 +1,39 @@ +.. _ingest-introduction: + +============ +Introduction +============ + +Unlike traditional RDBMS systems, Qserv does not support direct ingestion of data via +SQL ``INSERT`` or MySQL's ``LOAD DATA INFILE`` statements. Neither one can create databases or tables +directly via SQL DDL statements like ``CREATE DATABASE``, ``CREATE TABLE`` and similar. Instead, data +are ingested and managed via the Qserv Ingest REST API which is fully covered in `The Ingest Workflow Developer's Guide `_. +There are several reasons for this design choice: + +- Implementing a parser for the SQL DDL and DML statements is a complex and time-consuming process. + Implementing a correct semantic of the SQL statements in a realm of the distributed database + is an even more daunting task. +- The performance of the SQL-based ingest protocol is not sufficient for high-throughput data ingestion. + - **Note**: Qserv is designed to handle data volumes of the order of many Petabytes. +- The REST services (unlike the simple text-based SQL statements) allow for more *structural* data formats + for user inputs such as schemas (``JSON``) and data (``CSV``). Verifying the syntactical and semantical + correctness of the data is easier when the data are structured. +- The REST services provide a reliable and transparent mechanism for managing and tracking the distributed + state of the data products within Qserv. +- Many operations on the REST services can be made idempotent and can be easily retried in case of failures. +- By not being bound to a particular SQL dialect, the REST services provide a more flexible and portable + interface for data ingestion. The API can be extended to support new types of data management requests, + new data formats, and data sources as needed without changing the core of the Qserv engine. + +The API serves as a foundation for designing and implementing the data ingestion processes that +are loosely called the *ingest workflows*. There may be many such workflows depending on a particular +use case, the amount of data to be ingested, data delivery requirements, and the overall complexity +of the data. + +Read `The Ingest Workflow Developer's Guide `_ for further details on the REST services and their +usage. An explanation of a simple Kubernetes-based ingest workflow application `qserv-ingest `_ +is also provided in this documentation portal. + +Also note that a simple ingest API is provided by :ref:`http-frontend` for ingesting and managing user tables. + +Instructions on preparing the input data for the ingest workflows can be found in the :ref:`ingest-data` section. diff --git a/doc/ingest/qserv-ingest/index.rst b/doc/ingest/qserv-ingest/index.rst new file mode 100644 index 0000000000..83e2348f20 --- /dev/null +++ b/doc/ingest/qserv-ingest/index.rst @@ -0,0 +1,15 @@ +.. _installation-label: + +#################################### +The Kubernetes-based Ingest Workflow +#################################### + +.. 
toctree:: + :maxdepth: 2 + + input-data + version + run + repcli + itest + diff --git a/doc/admin/qserv-ingest/ingest.yaml b/doc/ingest/qserv-ingest/ingest.yaml similarity index 100% rename from doc/admin/qserv-ingest/ingest.yaml rename to doc/ingest/qserv-ingest/ingest.yaml diff --git a/doc/admin/qserv-ingest/input-data.rst b/doc/ingest/qserv-ingest/input-data.rst similarity index 100% rename from doc/admin/qserv-ingest/input-data.rst rename to doc/ingest/qserv-ingest/input-data.rst diff --git a/doc/admin/qserv-ingest/itest.rst b/doc/ingest/qserv-ingest/itest.rst similarity index 100% rename from doc/admin/qserv-ingest/itest.rst rename to doc/ingest/qserv-ingest/itest.rst diff --git a/doc/admin/qserv-ingest/repcli.rst b/doc/ingest/qserv-ingest/repcli.rst similarity index 100% rename from doc/admin/qserv-ingest/repcli.rst rename to doc/ingest/qserv-ingest/repcli.rst diff --git a/doc/admin/qserv-ingest/run.rst b/doc/ingest/qserv-ingest/run.rst similarity index 100% rename from doc/admin/qserv-ingest/run.rst rename to doc/ingest/qserv-ingest/run.rst diff --git a/doc/admin/qserv-ingest/version.rst b/doc/ingest/qserv-ingest/version.rst similarity index 100% rename from doc/admin/qserv-ingest/version.rst rename to doc/ingest/qserv-ingest/version.rst diff --git a/doc/templates/overview.html b/doc/templates/overview.html index 808666e237..be06e3bb89 100644 --- a/doc/templates/overview.html +++ b/doc/templates/overview.html @@ -7,58 +7,96 @@

Qserv documentation

Welcome! This is the documentation for Qserv {{ version|e }}, last updated {{ last_updated|e }}.

Qserv is an open source MPP SQL database system designed originally to host the astronomical catalogs of the Legacy Survey of Space and Time to be conducted by the Rubin Observatory.

General documentation for Qserv

    Introduction – what Qserv is and what makes it different from other RDBMS
    User's Guide – submitting queries and interpreting results
    Administrator's Guide – installing, configuring and managing Qserv
    Ingesting Catalogs – interfaces and tools for ingesting and managing data in Qserv
    Developer's Guide – how Qserv is implemented and how to contribute to it

Indices and tables

    General index – presents all terms and concepts, alphabetically
    Glossary – Qserv terms explained
    Full table of contents – lists all sections and subsections

External links

    Source code – the source code on GitHub

{% endblock %} \ No newline at end of file diff --git a/doc/user/async.rst b/doc/user/async.rst new file mode 100644 index 0000000000..72d0fa7efa --- /dev/null +++ b/doc/user/async.rst @@ -0,0 +1,174 @@ +.. warning:: Known Issues and Limitations: + + - The API has a confusing inconsistency in the naming convention of identifiers ``ID``, ``job-id``, ``jobId``, + ``query-id`` and ``queryId``. These are used interchangeably in the documentation and error messages posted + by Qserv. In reality, they all refer to the same entity - the autogenerated unique identifiers of the user + queries submitted via this interface. This inconsistency will be addressed in the future. + + - There is no way in the current API to get errors on queries failed during the asynchronous processing. The only + tests made by the ``SUBMIT`` command are to ensure that: + + - the query syntax is valid + - the query refers to the correct context (names of databases, tables, columns) + + Any problems detected by Qserv at the query entry point will be reported right away. + +###################### +Asynchronous Query API +###################### + +This is a summary of Qserv's asynchronous query API, as developed in: + +- https://rubinobs.atlassian.net/browse/DM-4451 + +Submitting Queries +================== + +The general syntax for submitting queries for asynchronous processing is: + +.. code-block:: sql + + SUBMIT + +This is illustrated by the following example: + +.. code-block:: sql + + SUBMIT SELECT objectId FROM dp02_dc2_catalogs.Object LIMIT 3 + +If the query validation succeded and the query placed into a processing queue, the command will always return one row with +two columns, where the particularly interesting column is ``jobId``: + +.. code-block:: + + +--------+---------------------+ + | jobId | resultLocation | + +--------+---------------------+ + | 311817 | table:result_311817 | + +--------+---------------------+ + +At this poing the query is running asynchronously. The ``jobId`` is the unique identifier of the query that can be used +for checking the query status, retrieving the results, or cancelling the query. + +Checking Status +=============== + +Based on the ``jobId`` returned by the ``SUBMIT`` command, you can check the status of the query by querying the +``information_schema.processlist`` table: + +.. code-block:: + + SELECT * FROM information_schema.processlist WHERE id=311817\G + +**Note**: ``\G`` is a MySQL command that formats the output. It's not part of the SQL syntax. +The command is quite handy to display result sets comprising many columns or having long values of +the columns in a more readable format. + +The query will return a row with the following columns: + +.. code-block:: + + *************************** 1. 
row *************************** + ID: 311817 + USER: anonymous + HOST: NULL + DB: dp02_dc2_catalogs + COMMAND: ASYNC + TIME: NULL + STATE: COMPLETED + INFO: SELECT objectId FROM dp02_dc2_catalogs.Object LIMIT 3 + SUBMITTED: 2024-10-06 20:01:18 + COMPLETED: 2024-10-06 20:01:18 + RETURNED: NULL + CZARID: 9 + RESULTLOCATION: table:result_311817 + NCHUNKS: 1477 + TotalChunks: NULL + CompletedChunks: NULL + LastUpdate: NULL + +Particularly interesting columns here are: + +- ``ID``: the unique identifier of the original query (it's the same as ``jobId`` reported by the ``SUBMIT`` command) +- ``STATE``: the query status, which can be one of: ``EXECUTING``, ``COMPLETED``, ``FAILED``, or ``ABORTED`` + +One can also use the following information commands to get the status of all active queries: + +.. code-block:: sql + + SHOW PROCESSLIST + SHOW FULL PROCESSLIST + +**Note**: once the query is over and the results are retrieved, the corresponding row in the ``information_schema.processlist`` +table will be deleted. And the query status will no longer be available. However, Qserv will still maintain the history +of the queries in other system tables. You may contact the Qserv administrator to get the history of the queries should +you need it. + +Retrieving Results +================== + +To retrieve the results of a query, use the following syntax: + +.. code-block:: sql + + SELECT * FROM qserv_result() + +This will return the full results (columns and rows) of the original query corresponding to the provided identifier of +the query. + +For example, the following query will return the results of the query with ``jobId`` of ``311817``: + +.. code-block:: + + SELECT * FROM qserv_result(311817) + +---------------------+ + | objectId | + +---------------------+ + | 1248649384967536732 | + | 1248649384967536769 | + | 1248649384967536891 | + +---------------------+ + +The command may be called one time only. The query result table will be deleted after returning the result set. +Any subsequent attempts to retrieve the results will return an error message: + +.. code-block:: + + SELECT * FROM qserv_result(311817) + ERROR 1146 (42S02) at line 1: Table 'qservResult.result_311817' doesn't exist + +Cancellation +============ + +The general syntax for cancelling running queries is: + +.. code-block:: sql + + CANCEL + +The following example illustrates the technique for cancelling a query that is supposed to take +a long time to complete (the query produces a very large result set): + +.. code-block:: + + SUBMIT SELECT objectId FROM dp02_dc2_catalogs.Object + +--------+---------------------+ + | jobId | resultLocation | + +--------+---------------------+ + | 311818 | table:result_311818 | + +--------+---------------------+ + + CANCEL 311816 + +--------------------------------------+ + | command | + +--------------------------------------+ + | Trying to kill query: CANCEL 311818 | + +--------------------------------------+ + +Note that it's okay to call the ``CANCEL`` command multiple times. If the query has already completed, the command will +post the following error message: + +.. 
code-block:: + + CANCEL 311818 + ERROR 4005 (Proxy) at line 1: KILL/CANCEL failed: Unknown or finished query ID: CANCEL 311818 diff --git a/doc/user/http-frontend-general.rst b/doc/user/http-frontend-general.rst new file mode 100644 index 0000000000..990dfc4bb4 --- /dev/null +++ b/doc/user/http-frontend-general.rst @@ -0,0 +1,164 @@ + +General guidelines +================== + +This section provides an overview of the general guidelines for utilizing the REST services of the frontend. + +JSON +--------- + +All services, regardless of the HTTP *method* (``GET``, ``POST``, etc.), report results in JSON objects. +The schema of these objects varies depending on the service called. Detailed explanations of +the schemas can be found in the corresponding sections where the services are described. + +.. _http-frontend-general-error-reporting: + +Error reporting +--------------- + +When a service receives a request, it will always return a response with the HTTP code ``200``. +The actual completion status of the request is indicated in the JSON object: + +.. code-block:: + + { "success" : , + "error" : , + "error_ext" : , + "warning" : + } + +The following attributes are related to the completion status: + +``success`` : *number* + The flag indicating if the request succeeded: + + - ``0`` if a request failed (then see attributes ``warning``, ``error``, and ``error_ext``) + - any other number if a request succeeded (also inspect the attribute ``warning`` for non-critical notifications) + +``error`` : *string* + An explanation of the error (if any). + +``error_ext`` : *object* + The optional information on the error (if any). The content of the object is request-specific. + Details can be found in the reference section of the corresponding request. + +``warning`` : *string* + The optional warning for on-critical conditions. Note that warnings may be posted for both + completed or failed requests. + +Other HTTP codes (``3xx``, ``404``, ``5xx``, etc.) could also be returned by the frontend's HTTP server or intermediate proxy servers. + +Protocol Versioning +------------------- + +The API adheres to the optional version control mechanism introduced in: + +- https://rubinobs.atlassian.net/browse/DM-35456 + +Application developers are encouraged to use the mechanism to reinforce the +integrity of the applications. + +Client applications can utilize version numbers in two ways: + +- *Pull mode*: The client explicitly requests the version implemented by the frontend and compares it with the expected version. +- *Push mode*: The client includes the expected version number as a parameter in service requests, allowing the services to verify + if the version matches the frontend implementation. + +Developers can choose to use either method, both, or neither to ensure application integrity. + +Pull mode +^^^^^^^^^ + +To support the first scenario, the API provides a special metadata service that returns +the version number along with additional information about the frontend: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``GET`` + - ``/meta/version`` + +The request object for this service is not required, or it can be an empty JSON object ``{}``. +Upon successful completion, the service will return a JSON object containing the following attributes: + +.. code-block:: + + { "kind" : , + "name" : , + "id" : , + "instance_id" : , + "version" : + } + +Where: + +``kind`` : *string* + The name of the service. The current implementation always reports: + + .. 
code-block:: + + qserv-czar-query-frontend + +``name`` : *string* + The unique name of the frontend within a given Qserv. The current implementation always reports: + + .. code-block:: + + http + +``id`` : *number* + The numeric identifier of the frontend within a given Qserv. The number returned here may vary. + +``instance_id`` : *string* + An identifier of the Qserv. A value of the attribute depends on a particular deployment of Qserv. + +``version`` : *number* + The current version number of the API. + +Example: + +.. code-block:: json + + { "kind" : "qserv-czar-query-frontend", + "name" : "http", + "id" : 8, + "instance_id" : "qserv-prod", + "version" : 38, + "success" : 1 + } + +Push mode +^^^^^^^^^ + +In the case of the second scenario, an application will pass the desired version number as +a request parameter. The number would be a part of the request's query for the method. For example, +the following request for checking the status of an ongoing query might look like this: + +.. code-block:: bash + + curl -k 'https://localhost:4041/query-async/status/1234?version=38' -X GET + +For other HTTP methods used by the API, the version number must be included in the body of the request as shown below: + +.. code-block:: bash + + curl -k 'https://localhost:4041/query-async' -X POST \ + -H 'Content-Type: application/json' \ + -d'{"version":38,"query":"SELECT ..."}' + +If the number does not match expectations, such a request will fail and the service will return the following +response. Here is an example of what will happen if the wrong version number ``29`` is specified instead +of ``38`` (as specified in the example above): + +.. code-block:: json + + { "success" : 0, + "error" : "The requested version 29 of the API is not in the range + supported by the service.", + "error_ext" : { + "max_version" : 38, + "min_version" : 30 + }, + "warning" : "" + } diff --git a/doc/user/http-frontend-ingest.rst b/doc/user/http-frontend-ingest.rst new file mode 100644 index 0000000000..c5f64e8458 --- /dev/null +++ b/doc/user/http-frontend-ingest.rst @@ -0,0 +1,360 @@ +.. _http-frontend-ingest: + +The User Table Ingest Interface +=============================== + +The frontend provides a simple interface for ingesting and managing user-defined tables in Qserv. Key features and limitations include: + +- Supports creation and ingestion of simple (non-partitioned) tables only. +- Table sizes are constrained by the frontend's available memory: + + - Practical limit is a few GB. + - Limit may be lower for multiple simultaneous table ingest requests. + +- Only synchronous interface is currently supported. + + - **Asynchronous interface is under evaluation.** + +- Schema definition and table data are sent to the frontend in a single JSON object. +- Clients can request optional indexes on ingested tables during the ingest process. +- User-defined tables are automatically created by the request processor within the user databases. + + - The service enforces a specific naming convention for user databases to avoid conflicts with production data products in Qserv. + - The naming convention is detailed in the relevant section of the document. + +- Users can delete tables or their corresponding databases. + + - **Currently, the service does not support authentication/authorization or user quota control.** + +- No backups are provided for tables ingested using this mechanism. +- Binary data types are supported. 
+ + - **Binary data must be converted to an appropriate format for inclusion in the JSON request object.** + - See: :ref:`ingest-general-binary-encoding` (REST) + +- The API follows the same :ref:`http-frontend-general-error-reporting` scheme as adopted for :ref:`http-frontend-query`. + +Ingesting tables +---------------- + +The following REST service implements the synchronous interface for ingesting a table into Qserv: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``POST`` + - ``/ingest/data`` + +A client must include a JSON object in the request body to specify the operation to be performed. The object follows this schema: + +.. code-block:: + + { "database" : , + "table" : , + "binary_encoding" : , + "timeout" : , + "schema" : , + "indexes" : , + "rows" : + } + +Where: + +``database`` : *string* + The required name of a user database. The names of the databases must start with the following prefix: + + .. code-block:: + + user_ + + The rest of the name should include the name of a user or a role. For example: + + .. code-block:: + + user_gapon + +``table`` : *string* + The required name of a table. Table names may not start with the following prefix: + + .. code-block:: + + qserv_ + + This prefix is reserved for naming internal tables that Qserv places into user databases. + +``binary_encoding`` : *string* = ``hex`` + The optional binary encoding of the binary data in the table. For further details see: + + - :ref:`ingest-general-binary-encoding` (REST) + +``schema`` : *array* + The required schema definition. Each element of the array defines a column: + + .. code-block:: + + { "name" : , + "type" : + } + + Where: + + ``name`` + The name of a column + ``type`` + A valid MySQL type + + **Note**: The service preserves the column order when creating a table. + +``indexes`` : *array* = ``[]`` + The optional indexes will be created after ingesting the table. + + More information on the index specification requirements can be found in the dedicated section of the document: + :ref:`http-frontend-ingest-indexes`. + +``rows`` : *array* + The required collection of the data rows to be ingested. Each element of the array represents a complete row, + where elements of the row correspond to the values of the respective columns: + + .. code-block:: + + [ [ , ... ], + ... + [ , ... ] + ] + + The number of elements in each row must match the number of columns defined in the table schema. In case of a mismatch, + the service will complain and refuse to execute the request. + + The order and types of elements in each row should correspond to the order and types of the corresponding columns in + the table schema. The service will attempt to convert the data to the appropriate types. If the conversion fails, the + service will refuse to execute the request. + +``timeout`` : *number* = ``300`` + The optional timeout (in seconds) that limits the duration of the internal operations initiated by the service. + In practical terms, this means that the total wait time for the completion of a request will not exceed the specified timeout. + + **Note**: The number specified as a value of the attribute can not be ``0``. + +A call to this service will block the client application for the time required to create +a database (if it does not already exist), create a table, process and ingest the data, and perform +additional steps (such as creating indexes). The request will fail if it exceeds the specified (or implied) timeout. + +Here is an example of the simple table creation specification: + +.. 
code-block:: json + + { "version" : 38, + "database" : "user_gapon", + "table" : "employee", + "schema" : [ + { "name" : "id", "type" : "INT" }, + { "name" : "val", "type" : "VARCHAR(32)" }, + { "name" : "active", "type" : "BOOL" } + ], + "rows" : [ + [ "123", "Igor Gaponenko", 1 ], + [ "2", "John Smith", 0 ] + ] + } + +The description could be pushed to the service using: + +.. code-block:: bash + + curl -k 'https://localhost:4041/ingest/data' -X POST \ + -H 'Content-Type: application/json' \ + -d'{"version":38,"database":"user_gapon",..}' + +If the request succeeds then the following table will be created: + +.. code-block:: sql + + CREATE TABLE `user_gapon`.`employee` ( + + `qserv_trans_id` int(11) NOT NULL, + `id` int(11) DEFAULT NULL, + `val` varchar(32) DEFAULT NULL, + `active` tinyint(1) DEFAULT NULL, + + ) ENGINE=MyISAM DEFAULT CHARSET=latin1; + +.. _http-frontend-ingest-indexes: + +Creating indexes +^^^^^^^^^^^^^^^^ + +.. note:: + + For detailed information on the schema of the index specifications, please refer to the following document: + + - :ref:`admin-data-table-index` (ADMIN) + +Indexes, if needed, must be specified in the ``indexes`` attribute of the table creation request. This attribute is a JSON array, +where each entry represents an index specification: + +.. code-block:: + + { "version" : 38, + "indexes" : [ + { "index" : , + "spec" : , + "comment" : , + "columns" : [ + { "column" : , "length" : , "ascending" : }, + ... + ] + }, + ... + ], + ... + } + +A few notes: + +- It is possible to create one or many indexes in such specifications. +- Index names (attribute ``index``) must be unique for the table. +- An index may involve one or many columns as specified in the array ```columns```. +- Index comment (attribute ``comment``) is optional. +- Other attributes are mandatory. + +Here is an example of the earlier presented simple table creation specification which also +includes an index specification: + +.. code-block:: json + + { "version" : 38, + "database" : "user_gapon", + "table" : "employee", + "schema" : [ + { "name" : "id", "type" : "INT" }, + { "name" : "val", "type" : "VARCHAR(32)" }, + { "name" : "active", "type" : "BOOL" } + ], + "rows" : [ + [ "123", "Igor Gaponenko", 1 ], + [ "2", "John Smith", 0 ] + ], + "indexes" : [ + { "index" : "idx_id", + "spec" : "UNIQUE", + "comment" : "This is the primary key index", + "columns" : [ + { "column" : "id", "length" : 0, "ascending" : 1 } + ] + } + ] + } + +This specification will result in creating the following table: + +.. code-block:: sql + + CREATE TABLE `user_gapon`.`employee` ( + + `qserv_trans_id` int(11) NOT NULL, + `id` int(11) DEFAULT NULL, + `val` varchar(32) DEFAULT NULL, + `active` tinyint(1) DEFAULT NULL, + + UNIQUE KEY `idx_id` (`id`) COMMENT 'This is the primary key index' + + ) ENGINE=MyISAM DEFAULT CHARSET=latin1; + +Deleting tables +--------------- + +Existing tables can be deleted with the following service: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``DELETE`` + - ``/ingest/table/:database/:table`` + +Where: + +``database`` : *string* + The required name of the user database containing the table to be deleted. + + **Note**: Database names must start with the following prefix: + + .. code-block:: + + user__ + +``table`` : *string* + The required name of a table to be deleted. + +For example: + +.. 
code-block:: bash + + curl -k 'https://localhost:4041/ingest/table/user_gapon/employees' -X DELETE \ + -H 'Content-Type: application/json' \ + -d'{"version":38}' + +A few notes: + +- Option ``-k`` is used to ignore the SSL certificate verification. +- The sender's content header (option ``-H``) is required by the service. +- The request's body can be empty. However, it needs to be a valid JSON object, such as ``{}``. +- The present implementation of the service doesn't provide user authentication/authorization + services that prevent the deletion of someone else's tables. + +Deleting user databases +----------------------- + +Existing databases (including all tables within such databases) can be deleted with the following service: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``DELETE`` + - ``/ingest/database/:database`` + +Where: + +``database`` : *string* + The required name of a database to be deleted. + + **Note**: Database names must start with the following prefix: + + .. code-block:: + + user__ + +For example: + +.. code-block:: bash + + curl -k 'https://localhost:4041/ingest/database/user_gapon' -X DELETE \ + -H 'Content-Type: application/json' \ + -d'{"version":38}' + +A few notes: + +- The ``-k`` option is used to ignore SSL certificate verification. +- The ``-H`` option is required to specify the content type as JSON. +- The request body can be empty but must be a valid JSON object, such as ``{}``. +- The current implementation does not provide authentication/authorization to prevent + the deletion of other users' databases. + +Possible extensions of the table ingest service +----------------------------------------------- + +.. warning:: + + None of the improvements mentioned below have been implemented. This section is primarily + to outline potential future enhancements. + +Potential enhancements for the table ingest service include: + +- Adding services to manage (create or drop) indexes on existing tables. +- Introducing a service for asynchronous table ingests. +- Implementing a service to track the status and progress of asynchronous requests. +- Providing a service to cancel queued asynchronous requests. +- Supporting table ingests using the ``multipart/form-data`` format, where data is sent as + a ``CSV``-formatted file. diff --git a/doc/user/http-frontend-query.rst b/doc/user/http-frontend-query.rst new file mode 100644 index 0000000000..87230e2089 --- /dev/null +++ b/doc/user/http-frontend-query.rst @@ -0,0 +1,423 @@ +.. _http-frontend-query: + +The Query Interface +=================== + +Submitting queries +------------------ + +.. _http-frontend-query-sync: + +Synchronous interface +^^^^^^^^^^^^^^^^^^^^^ + +The following REST service implements the synchronous interface: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``POST`` + - ``/query`` + +To specify the query to be executed, a client must include the following JSON object in the request body: + +.. code-block:: + + { "query" : , + "database" : , + "binary_encoding" : + } + +Where: + +``query`` : *string* + The required query text in which the default database may be missing. In the latter case, + a client needs to provide the name in a separate parameter. + +``database`` : *string* = ``""`` + The optional name of the default database for queries where no such name was provided explicitly. + +``binary_encoding`` : *string* = ``hex`` + The optional binary encoding of the binary data in the table. 
For further details see: + + - :ref:`ingest-general-binary-encoding` (REST) + +A call to this service will block a client application until one of the following events occurs: + +- The query is successfully processed, and its result is returned to the caller. +- The query fails, and the corresponding error is returned to the caller. +- The frontend becomes unavailable (due to a crash, restart, or networking problem, etc.), and the network connection is lost. + +If the request is successfully completed, the service will return a result set in the JSON object explained in the section Result sets. + +.. _http-frontend-query-async: + +Asynchronous interface +^^^^^^^^^^^^^^^^^^^^^^ + +The following REST service implements the asynchronous interface: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``POST`` + - ``/query-async`` + +A client is required to pass the JSON object in the request body to specify what query +needs to be executed. The object has the following schema: + +.. code-block:: + + { "query" : , + "database" : + } + +Where: + +``query`` : *string* + The required query text in which the default database may be missing. In the latter case, a client needs to provide + the name in a separate parameter. + +``database`` : *string* = ``""`` + The optional name of the default database for queries where no such name was provided explicitly. + +A call to this service will normally block a client application for a short period until +one of the following will happen: + +- The query will be successfully analyzed and queued for asynchronous processing by Qserv. + In this case, a response object with the unique query identifier will be returned to a caller. +- The query will fail and the corresponding error will be returned to the caller. +- The frontend will become unavailable (due to a crash, restart, or networking problem, etc.) + and a network connection will be lost. + +In case of the successful completion of the request, the service will return the following JSON object: + +.. code-block:: + + { "queryId" : + } + +The number reported in the object should be further used for making the following requests explained +in the dedicated subsections below: + +- checking the status of the query to see when it's finished +- requesting a result set of the query +- or, canceling the query if needed + +Checking the status of the ongoing query +----------------------------------------- + +This service also allows checking the status of queries submitted via the synchronous +interface, provided the unique identifier of such query is known to the user: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``GET`` + - ``/query-async/status/:queryId`` + +Where: + +``queryId`` : *number* + The required unique identifier of the previously submitted query. + +If the query identifier is not valid, the service will report an error in the response object. +For example, consider the following request: + +.. code-block:: bash + + curl -k 'https://localhost:4041/query-async/status/123?version=38' -X GET + +It might result in the following response: + +.. code-block:: json + + { "success" : 0, + "error" : "failed to obtain info for queryId=123, + ex: Czar::getQueryInfo Unknown user query, err=, + sql=SELECT status,messageTable,resultQuery FROM QInfo WHERE queryId=123", + "error_ext" : {}, + } + +If the query identifier is valid then the following object will be returned: + +.. code-block:: + + { "success" : 1, + ... 
+ "status" : { + "queryId" : , + "status" : , + "totalChunks" : , + "completedChunks" : , + "queryBeginEpoch" : , + "lastUpdateEpoch" : + } + } + +Where ``status`` is an object that has the following attributes: + +``queryId`` : *number* + The unique identifier of the previously submitted query. + +``status`` : *string* + The current status of the query, which can have one of the following values: + + - ``EXECUTING`` - The query processing is still in progress. + - ``COMPLETED`` - The query has been completed. + - ``FAILED`` - The query failed. + - ``ABORTED`` - The query was aborted: + + - explicitly by a user using the query cancellation REST service explained in this document. + - or, implicitly by Qserv if the intermediate result set of the query exceeds the large result + limit (which is configured by the Qserv administrators). + - or, implicitly when the query processing service was restarted due to some failure or by + Qserv administrators. + +``totalChunks`` : *number* + The total number of so-called "chunks" (spatial shards used for splitting the large tables in Qserv + into smaller sub-tables to be distributed across worker nodes of Qserv). + +``completedChunks`` : *number* + The number of chunks that have been processed by Qserv so far. The value of this parameter varies + from ``0`` to the maximum number reported in the attribute ``totalChunks``. + +``queryBeginEpoch`` : *number* + The 32-bit number representing the start time of the query expressed in seconds since the UNIX *Epoch*. + +``lastUpdateEpoch`` : *number* + The 32-bit number representing the last time when the query status was recorded/updated by the Qserv + internal monitoring system. The timestamp is the number of seconds since the UNIX *Epoch*. + The service returns a value of ``0`` if either of the following is true: + + - the query processing hasn't started yet + - the request wasn't inspected by the monitoring system + +Here is an example of a status inquiry request that succeeded: + +.. code-block:: + + { "success" : 1, + "status" : { + "queryId" : 310554, + "status" : "EXECUTING", + "totalChunks" : 1477, + "completedChunks" : 112, + "queryBeginEpoch" : 1708141345, + "lastUpdateEpoch" : 1708141359 + } + } + +The status service can be used to estimate when the query will finish. Typically, client +applications should wait until the query status is "COMPLETED" before fetching +the result set by calling the next service explained below. + +.. _http-frontend-query-async-result: + +Requesting result sets +---------------------- + +The query results are retrieved by calling the following service: + +.. list-table:: + :widths: 10 25 65 + :header-rows: 1 + + * - method + - service + - query parameters + * - ``GET`` + - ``/query-async/result/:queryId`` + - ``binary_encoding=`` + +Where: + +``queryId`` : *number* + The required unique identifier of the previously submitted query. + +``binary_encoding`` : *string* = ``hex`` + The optional format for encoding the binary data into JSON: + + - ``hex`` - for serializing each byte of the binary data into the hexadecimal format of 2 ASCII characters per byte, + where the encoded characters will be in a range of ``0 .. F``. In this case, + the encoded value will be packaged into a JSON string. + - ``b64`` - for serializing bytes into a string using the Base64 algorithm with + padding (to ensure 4-byte alignment). + - ``array`` - for serializing bytes into a JSON array of numbers in a range of ``0 … 255``.
+ + Here is an example of a sequence of 4 bytes encoded into the hexadecimal format: + + .. code-block:: + + "0A11FFD2" + + The array representation of the same binary sequence would look like this: + + .. code-block:: + + [10,17,255,210] + +As in the case of the status inquiry request, if the query identifier is not valid then +the service will report an error in the response object. Otherwise, a JSON object explained +in the section :ref:`http-frontend-query-resultsets` will be returned. + +.. _http-frontend-query-resultsets: + +Result sets +^^^^^^^^^^^ + +Both flavors of the query submission services will return the following JSON object in case of +the successful completion of the queries (**Note**: comments ``//`` used in this example are not allowed in JSON): + +.. code-block:: + + { "schema" : [ + + // Col 0 + { "table" : , + "column" : , + "type" : , + "is_binary" : + }, + + // Col 1 + { "table" : , + "column" : , + "type" : , + "is_binary" : + }, + + ... + + // Col (NUM_COLUMNS-1) + { "table" : , + "column" : , + "type" : , + "is_binary" : + } + ], + + "rows" : [ + + // Col 0 Col 1 Col (NUM_COLUMNS-1) + // -------- -------- ------------------ + [ , , ... ], // Result row 0 + [ , , ... ], // Result row 1 + ... + [ , , ... ] // Result row (NUM_ROWS-1) + ] + } + +Where: + +``schema`` : *array* + A collection of column descriptions, in which each entry is a dictionary representing a definition of + the corresponding column of the result set: + + ``table`` : *string* + The name of the table the column belongs to. + + ``column`` : *string* + The name of the column. + + ``type`` : *string* + The MySQL type of the column as in the MySQL statement: + + .. code-block:: sql + + CREATE TABLE ... + + ``is_binary`` : *number* + The flag indicating if the column type represents the binary type. + A value that is not ``0`` indicates the binary type. + The MySQL binary types are documented in the corresponding sections of the MySQL Reference Manual: + + - `The BINARY and VARBINARY Types `_ + - `The BLOB and TEXT Types `_ + - `Bit-Value Type - BIT `_ + + **Attention**: Binary values need to be processed according to the format specified in the optional + attribute ``binary_encoding`` in: + + - Processing responses of query requests submitted via the :ref:`http-frontend-query-sync` + - :ref:`http-frontend-query-async-result` of queries submitted via the asynchronous interface + +``rows`` : *array* + A collection of the result rows, where each row is an array of strings representing values at positions + of the corresponding columns (see the ``schema`` attribute above). + +For example, consider the following query submission request: + +.. code-block:: bash + + curl -k 'https://localhost:4041/query' -X POST \ + -H 'Content-Type: application/json' \ + -d'{"version":38,"query":"SELECT objectId,coord_ra,coord_dec FROM dp02_dc2_catalogs.Object LIMIT 5"}' + +The query could return: + +.. 
code-block:: json + + { "schema":[ + { "column" : "objectId", "table" : "", "type" : "BIGINT", "is_binary" : 0 }, + { "column" : "coord_ra", "table" : "", "type" : "DOUBLE", "is_binary" : 0 }, + { "column" : "coord_dec","table" : "", "type" : "DOUBLE", "is_binary" : 0 }], + "rows":[ + [ "1248640588874548987", "51.5508603", "-44.5061095" ], + [ "1248640588874548975", "51.5626104", "-44.5061529" ], + [ "1248640588874548976", "51.5625138", "-44.5052961" ], + [ "1248640588874548977", "51.3780995", "-44.5072101" ], + [ "1248640588874548978", "51.374245", "-44.5071616" ]], + "success" : 1, + "warning" : "", + "error" : "", + "error_ext" : {} + } + +Canceling queries +----------------- + +.. note:: + + This service can be used for terminating queries submitted via the synchronous or asynchronous + interfaces, provided the unique identifier of such a query is known to the user. + +A query can be canceled using: + +.. list-table:: + :widths: 10 90 + :header-rows: 0 + + * - ``DELETE`` + - ``/query-async/:queryId`` + +Where: + +``queryId`` : *number* + The required unique identifier of the previously submitted query. + +If the query identifier is not valid, the service will report an error in the response object. For example, consider the following request: + +.. code-block:: bash + + curl -k 'https://localhost:4041/query-async/123?version=38' -X DELETE + +It might result in the following response: + +.. code-block:: json + + { "success": 0, + "error" : "failed to obtain info for queryId=123, + ex: Czar::getQueryInfo Unknown user query, err=, + sql=SELECT status,messageTable,resultQuery FROM QInfo WHERE queryId=123" + } + +If the query identifier is valid then the following object will be returned: + +.. code-block:: json + + { "success" : 1 + } + diff --git a/doc/user/http-frontend.rst b/doc/user/http-frontend.rst new file mode 100644 index 0000000000..5de379bba4 --- /dev/null +++ b/doc/user/http-frontend.rst @@ -0,0 +1,35 @@ + +.. note:: + + - This guide corresponds to version **38** of the Qserv REST API. Note that each API implementation has a specific version. + The version number will change if any modifications to the implementation or API that might affect users are made. + This document will be updated to reflect the latest API version. + - As of version **38**, all communications with the service are over SSL/TLS encrypted connections. + The service will not accept unencrypted connections. Use the ``-k`` option with ``curl`` to bypass SSL certificate + verification if necessary. + +.. _http-frontend: + +###################### +HTTP frontend of Qserv +###################### + +.. toctree:: + :maxdepth: 4 + + http-frontend-general + http-frontend-query + http-frontend-ingest + +This document describes the HTTP-based frontend for interacting with Qserv. This frontend complements +the one based on ``mysql-proxy``. + +Key features of the API presented in this document include: + +- Utilizes the HTTP protocol. +- Returns result sets (data and schema) in JSON objects. +- Supports both *synchronous* and *asynchronous* query submission operations. +- Offers a mechanism for tracking the progress of asynchronously submitted queries. +- Allows for query cancellation. +- Provides a simple interface for ingesting and managing user tables. +- Ensures protocol versioning to maintain the integrity of distributed applications. diff --git a/doc/user/index.rst b/doc/user/index.rst index d1f4ccbe62..2a9391affd 100644 --- a/doc/user/index.rst +++ b/doc/user/index.rst @@ -1,519 +1,10 @@ -.. 
warning:: +############ +User's Guide +############ - **Information in this guide is known to be outdated.** A documentation sprint is underway which will - include updates and revisions to this guide. +.. toctree:: + :maxdepth: 2 -.. highlight:: sql - -########## -User Guide -########## - -Introduction -============ - -LSST Query Services (Qserv) provides access to the LSST Database Catalogs. Users can query the catalogs using -standard SQL query language with a few restrictions described below. Why restricting it? We are intercepting -all queries, rewriting them and executing each large query as many sub-queries in parallel. Introducing these -restrictions greatly simplifies parsing incoming queries. - -Our parser uses an open-source grammar built from the SQL92 specification. It does not include anything beyond -SQL92, with minor exceptions (e.g., != is allowed as well as <>). - -The simplest way to look at it is to treat it as a MySQL database server, modulo restrictions and extensions -described in this document. - -If you run into any syntax that this Manual fails to document, please report it through the Qserv mailing -list. - - -Selected Design Aspects -======================= - -Partitioning and Sharding -------------------------- - -Qserv has been designed to handle large volumes of data that can be partitioned in one or more dimensions (for -example, by spacial locality). Once the partitioning column(s) are selected and partitioning parameters (such -as partition size) are chosen, loaded data gets directed to appropriate partitions ("chunks"), and chunks are -distributed (sharded) across nodes available in the cluster. - -In such scheme, a single large table typically consists of many chunks (it could even be tens of thousands of -chunks). This helps with running full-table-scan queries, as each such query can be executed in pieces, in -parallel. For example, imagine we have an Object table that we split into x chunks. Then Qserv will execute:: - - SELECT * from Object - -as:: - - SELECT * from Object_00001 - SELECT * from Object_00002 - ... - SELECT * from Object_x - -in parallel. - - -Director Table --------------- - -Often, multiple large tables need to be partitioned and joined together. To ensure joining such tables is -possible without sending lots of information between nodes, Qserv has a way to ensure all related chunks -always end up on the same machine. To enable that, Qserv has a notion of "Director Table", which "drives" -partitioning. For example, consider two tables:: - - TABLE Object ( - objectId BIGINT PRIMARY KEY, # unique identifier - ra DOUBLE, # spatial location (right ascension) - decl DOUBLE # spatial location (declination) - ) - -which contains information about astronomical objects (galaxies, stars), one row = one object, and:: - - TABLE Source ( - sourceId BIGINT PRIMARY KEY, # unique identifier - objectId BIGINT, # pointer to corresponding object - # Note, there maybe many sources per object - ra DOUBLE # spatial location (right ascension) - decl DOUBLE # spatial location (declination) - ) - -which contains information about individual detections of astronomical objects, one row = one detection of one -object. - -Note that astronomical objects tend to move, so individual detections of the same object might have different -ra/decl positions than the "average" location represented by the ra/decl of their corresponding object. 
- -If we elect the Object table to be the "Director Table", not only the Object table will be partitioned -according to its ra/decl values, but more importantly, Source table will be partitioned based on the ra/decl -of corresponding objects. - - -Secondary Index ---------------- - -The sharding scheme described above has a problem with locating data by objectId. To alleviate this, Qserv -maintains a specialized index that maps a primary key of the director table to a chunkId of the chunk that -contains a given row. Consider a query:: - - SELECT * from Object WHERE objectId = - -behind the scene, it will be executed as:: - - SELECT chunkId FROM IdToChunkMapping WHERE objectId = - -which is a quick index lookup, followed by :: - - SELECT * from Object_ WHERE objectId = - -which is another quick index lookup inside one small chunk. - -Note that the use of secondary index has some restrictions, as explained in the restrictions section below. - -By the way, do not attempt to issues queries directly on our internal chunk tables. It is blocked. - - -Extensions -========== - -This section covers extensions to sql which we introduced. - - -Spatial Constraints -------------------- - -Spatial constraints in Qserv can be expressed using one of the functions we introduced. Currently supported:: - - qserv_areaspec_box( - lonMin DOUBLE PRECISION, # [deg] Minimum longitude angle - latMin DOUBLE PRECISION, # [deg] Minimum latitude angle - lonMax DOUBLE PRECISION, # [deg] Maximum longitude angle - latMax DOUBLE PRECISION # [deg] Maximum latitude angle - ) - - qserv_areaspec_circle( - lon DOUBLE PRECISION, # [deg] Circle center longitude - lat DOUBLE PRECISION, # [deg] Circle center latitude - radius DOUBLE PRECISION # [deg] Circle radius - ) - - qserv_areaspec_ellipse( - lon DOUBLE PRECISION, # [deg] Ellipse center longitude - lat DOUBLE PRECISION, # [deg] Ellipse center latitude - semiMajorAxisAngle DOUBLE PRECISION, # [arcsec] Semi-major axis length - semiMinorAxisAngle DOUBLE PRECISION, # [arcsec] Semi-minor axis length - positionAngle DOUBLE PRECISION # [deg] Ellipse position angle, east of north - ) - - qserv_areaspec_poly( - v1Lon DOUBLE PRECISION, # [deg] Longitude angle of first polygon vertex - v1Lat DOUBLE PRECISION, # [deg] Latitude angle of first polygon vertex - v2Lon DOUBLE PRECISION, # [deg] Longitude angle of second polygon vertex - v2Lat DOUBLE PRECISION, # [deg] Latitude angle of second polygon vertex - ... - ) - -Example:: - - SELECT objectId - FROM Object - WHERE qserv_areaspec_box(0, 0, 3, 10) - -Note that as discussed in the "Restrictions" section below, spatial constraints **must** be expressed through -the qserv_areaspec\_* functions. - - -Restrictions -============ - -This section covers restriction you need to be aware of when interacting with Qserv. - - -Spatial constraints should be expressed through our qserv_areaspec\_* functions -------------------------------------------------------------------------------- - -Spatial constraints should be expressed through qserv_areaspec\_* functions (see Extensions section above for -details). Any other way of specifying spatial restrictions may be significantly slower (e.g., they might -devolve to be full table scan). For example, the form:: - - WHERE ra BETWEEN AND - AND decl BETWEEN AND - -even though it is equivalent to: - - qserv_areaspec_box(, , , ) - -should not be used. 
- - -Spatial constraints must appear at the beginning of WHERE ---------------------------------------------------------- - -Spatial constraint must appear at the very beginning of the WHERE clause (before or after the objectId -constraint, if there is any). - - -Only one spatial constraint is allowed per query ------------------------------------------------- - -Only one spatial constraint expressed through qserv_areaspec\_* is allowed per query, e.g., these are examples -of invalid queries:: - - WHERE qserv_areaspec_box(1, 35, 2, 38) - AND qserv_areaspec_box(5, 77, 6, 78) - -or :: - - WHERE qserv_areaspec_box(1, 35, 2, 38) - AND qserv_areaspec_circle(5, 77, 0.1) - - -Arguments passed to spatial constraints functions must be simple literals -------------------------------------------------------------------------- - -The arguments passed to the qserv_aresspec\_ functions must be simple literals. They may not contain any -references, e.g. may not refer to columns. - -Example of an invalid entry:: - - WHERE qserv_areaspec_box(3+4, ra*2, 0, 0) - - -OR is not allowed after qserv_areaspec\_* constraint ----------------------------------------------------- - -If the query has extra constraints after the qserv_areaspec\_* constraint, OR is not allowed immediately after -qserv_areaspec\_*, for example:: - - SELECT objectId, ra, decl, x - FROM Object - WHERE qserv_areaspec_box(1, 35, 2, 38) - AND x > 3.5 - -is valid, but - - SELECT objectId, ra, decl, x - FROM Object - WHERE qserv_areaspec_box(1, 35, 2, 38) - OR x > 3.5 - -is not allowed. We expect to remove this restriction in the future, see -`DM-2888 `_. - - -Secondary index constraint must be expressed through "=", "IN", or "BETWEEN" ----------------------------------------------------------------------------- - -If the query has objectId constraint, it should be expressed in one of these three forms:: - - SELECT * FROM Object WHERE objectId = 123 - - SELECT * FROM Object WHERE objectId IN (123, 453, 3465) - - SELECT * FROM Object WHERE objectId BETWEEN 123 AND 130 - -E.g., don't try to express it as "WHERE objectId != 1", or WHERE objectId > 123 etc. - -Note, we expect to allow decomposing objectId into bitfields (e.g., for sampling) in the future. See -`DM-2889 `_. - - -Column(s) used in ORDER BY or GROUP BY must appear in SELECT ------------------------------------------------------------- - -At the moment we require columns used in ORDER BY or GROUP BY to be listed in SELECT. Example of an invalid -query:: - - SELECT x - FROM T - ORDER BY y - -Correct version:: - - SELECT y, x - FROM T - ORDER BY y - - -Expressions/functions in ORDER BY clauses are not allowed ---------------------------------------------------------- - -In SQL92 ORDER BY is limited to actual table columns, thus expressions or functions in ORDER BY are rejected. -This is true for Qserv too. - -Example of an invalid ORDER BY:: - - SELECT id, ABS(x) - FROM Source - ORDER BY ABS(x) - -However, one can bypass this by using an alias, for example:: - - SELECT id, ABS(x) as ax - FROM Source - ORDER BY ax - - -Sub-queries are NOT supported ------------------------------ - -Sub queries are not supported. - - -Commands that modify tables are disallowed ------------------------------------------- - -Commands for creating or modifying tables are disabled. These commands include "INSERT, UPDATE, LOAD INTO, -CREATE, ALTER, TRUNCATE, DROP". We will revisit this as we start adding support for Level 3. 
- - -Outer joins are not supported with near-neighbor queries --------------------------------------------------------- - -Qserv does not support LEFT or RIGHT joins with near-neighbor predicates. - - -MySQL-specific syntax is not supported --------------------------------------- - -MySQL-specific syntax is not supported. Example of unsupported syntax that will be rejected: NAME_CONST. - - -Repeated column names through * are not supported -------------------------------------------------- - -Queries with a * that resolves to repeated column name are not supported. Example:: - - SELECT *, id - FROM Object - -will fail if the table Object has a column called "id". Similarly, this query will fail:: - - SELECT o.*, s.* - FROM Object AS o, - Source AS s - -if both tables Object and Source have a column called "id". - -A workaround would be to select columns explicitly and alias them, e.g. :: - - SELECT o.id AS oId, s.id AS sId - FROM Object AS o, - Source AS s - - -"USE INDEX()" is not supported ------------------------------- - -Qserv will reject query with "USE INDEX" hint. - - -Variables are not supported ---------------------------- - -You can't select into a variable. For example :: - - SELECT scisql_s2CPolyToBin(...) - FROM T - INTO @poly - -will fail. Related story `DM-2874 `_. - -User Defined Functions -====================== - -Qserv installation always comes with a set of predefined user defined functions: - * spherical geometry aimed to allow quick answers to the following sorts of questions: * Which points in a - table lie inside a region on the sphere? For example, an astronomer might wish to know which stars and - galaxies lie inside the region of the sky observed by a single camera CCD. * Which spherical regions in a - table contain a particular point? For example, an astronomer might with to know which telescope images - overlap the position of interesting object X - * photometry, aimed to provide conversions between raw fluxes, calibrated (AB) fluxes and AB magnitudes. - -For details, see `Science Tools for MySQL `_. - - -Example Queries Supported -========================= - - -Counts and simple selections ----------------------------- - -You can count objects and run simple selections. 
Few examples: - - -Count the number of rows in a table -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:: - - SELECT COUNT(*) - FROM Object - - -Find rows with a particular id -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:: - - SELECT * - FROM Object - WHERE objectId = - - -Select rows in a given area -^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:: - - SELECT objectId - FROM Object - WHERE qserv_areaspec_box(1, 35, 2, 38) - - -Select rows in a given area meeting certain criteria -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:: - - SELECT COUNT(*) - FROM Object - WHERE qserv_areaspec_box(0.1, -6, 4, 6) - AND x = 3.4 - AND y BETWEEN 1 AND 2 - - -Find a row with a particular id -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:: - - SELECT * - FROM Object - WHERE objectId = - - -Joins ------ - -Join two tables -^^^^^^^^^^^^^^^ -:: - - SELECT s.ra, s.decl, o.raRange, o.declRange - FROM Object o, - Source s - WHERE o.objectId = - AND o.objectId = s.objectId - -or :: - - SELECT s.ra, s.decl, o.raRange, o.declRange - FROM Object o, - Source s USING (objectId - WHERE o.objectId = - - -Find near neighbors in a given region -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:: - - SELECT o1.objectId AS objId1, - o2.objectId AS objId2, - scisql_angSep(o1.ra_PS, o1.decl_PS, o2.ra_PS, o2.decl_PS) AS distance - FROM Object o1, - Object o2 - WHERE qserv_areaspec_box(0, 0, 0.2, 1) - AND scisql_angSep(o1.ra_PS, o1.decl_PS, o2.ra_PS, o2.decl_PS) < 0.05 - AND o1.objectId <> o2.objectId - - -LIMIT, ORDER BY ---------------- - -Limit results, sort results -^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:: - - SELECT * - FROM Object - WHERE x > 4 - ORDER BY x - LIMIT 100 - - -Known Bugs -========== - -The list of all known / reported problems can be found at: `Data Access and Database Team User-facing -Bugs `_. - - -Selecting by objectId can miss a row ------------------------------------- - -Selecting rows using objectId sometimes does not return rows it should. -For details, see: `DM-2864 `_. - - -WHERE objectId BETWEEN fails ----------------------------- - -As explained above, queries in the form "WHERE objectId BETWEEN" are discouraged. In fact, Qserv will -currently return a cryptic message when such query is executed. For details, see -`DM-2873 `_. - - -Notes of Performance -==================== - -Use objectId when selecting sources ------------------------------------ - -If you need to locate a small number of sources, try to use objectId if you can. If you don't, your query will -require an index scan for every chunk of the Source table (which can potentially mean thousands of -chunk-queries). For example this query will require it:: - - SELECT * FROM Source WHERE sourceId = 500 - -but asking for sources related to a given object, like this one:: - - SELECT * FROM Source WHERE objectId = 123 AND sourceId = 500 - -will require an index scan for just a single chunk, and thus will typically be much faster. + sql + async + http-frontend diff --git a/doc/user/sql.rst b/doc/user/sql.rst new file mode 100644 index 0000000000..4faa348d20 --- /dev/null +++ b/doc/user/sql.rst @@ -0,0 +1,534 @@ + +.. highlight:: sql + +################# +Qserv SQL Dialect +################# + +Introduction +============ + +LSST Query Services (Qserv) provides access to the LSST Database Catalogs. Users can query the catalogs using +standard SQL query language with a few restrictions described below. Why restricting it? We are intercepting +all queries, rewriting them and executing each large query as many sub-queries in parallel. 
Introducing these +restrictions greatly simplifies parsing incoming queries. + +Our parser uses an open-source grammar built from the `SQL92 `_ specification. +It does not include anything beyond SQL92, with minor exceptions (e.g., ``!=`` is allowed as well as ``<>``). + +The simplest way to look at it is to treat it as a MySQL database server, modulo restrictions and extensions +described in this document. + +If you run into any syntax that this Manual fails to document, please report it to the Qserv developers. + +Selected Design Aspects +======================= + +Partitioning and Sharding +------------------------- + +Qserv is designed to handle large volumes of data that can be partitioned in one or more dimensions (e.g., by spatial locality). +Once the partitioning columns and parameters (such as partition size) are chosen, the data is directed to appropriate +partitions ("chunks"), which are then distributed (sharded) across the nodes in the cluster. + +In this scheme, a single large table typically consists of many chunks (potentially hundreds of thousands). This facilitates +running full-table-scan queries, as each query can be executed in parallel across multiple chunks. For example, if we have +an Object table split into ``12345`` chunks, Qserv will execute: + +.. code-block:: sql + + SELECT * FROM Object + +as ``12345`` worker-side queries processed in parallel: + +.. code-block:: sql + + SELECT * FROM Object_00001 + SELECT * FROM Object_00002 + ... + SELECT * FROM Object_12345 + + +Director Table +-------------- + +Often, multiple large tables need to be partitioned and joined together. To ensure joining such tables is +possible without sending lots of information between nodes, Qserv has a way to ensure all related chunks +always end up on the same machine. To enable that, Qserv has a notion of a "Director Table", which "drives" +partitioning. For example, consider two tables: + +.. code-block:: sql + + CREATE TABLE Object ( + objectId BIGINT PRIMARY KEY, -- unique identifier + ra DOUBLE, -- spatial location (right ascension) + decl DOUBLE -- spatial location (declination) + ) + +which contains information about astronomical objects (galaxies, stars), one row = one object, and: + +.. code-block:: sql + + CREATE TABLE Source ( + sourceId BIGINT PRIMARY KEY, -- unique identifier + objectId BIGINT, -- pointer to corresponding object + -- Note, there maybe many sources per object + ra DOUBLE -- spatial location (right ascension) + decl DOUBLE -- spatial location (declination) + ) + +which contains information about individual detections of astronomical objects, one row representing one detection of one object. + +Note that astronomical objects tend to move, so individual detections of the same object might have different ``ra`` / ``decl`` +positions than the "average" location represented by the ra/decl of their corresponding object. + +If we select the Object table to be the "Director Table", not only will the Object table be partitioned according to +its ``ra`` / ``decl`` values, but more importantly, the Source table will be partitioned based on the ``ra`` / ``decl`` +of corresponding objects. + + +The Director Index +------------------ + +The sharding scheme described above has a problem with locating data by objectId. To alleviate this, Qserv maintains a specialized +index that maps a primary key of the director table to a chunkId of the chunk that contains a given row. Consider the query: + +.. 
code-block:: sql + + SELECT * FROM Object WHERE objectId = + +behind the scene, it will be executed as: + +.. code-block:: sql + + SELECT chunkId FROM IdToChunkMapping WHERE objectId = + +which is a quick index lookup, followed by: + +.. code-block:: sql + + SELECT * FROM Object_ WHERE objectId = + +which is another quick index lookup inside one small chunk. + +Note that the use of the *director* index has some restrictions, as explained in the restrictions section below. + +By the way, do not attempt to issue queries directly on our internal chunk tables. It is blocked. + + +Extensions +========== + +This section covers extensions to sql which we introduced. + +Spatial Constraints +------------------- + +Spatial constraints in Qserv can be expressed using one of the functions we introduced. Currently supported: + +.. code-block:: sql + + qserv_areaspec_box( + lonMin DOUBLE, -- [deg] Minimum longitude angle + latMin DOUBLE, -- [deg] Minimum latitude angle + lonMax DOUBLE, -- [deg] Maximum longitude angle + latMax DOUBLE -- [deg] Maximum latitude angle + ) + +.. code-block:: sql + + qserv_areaspec_circle( + lon DOUBLE, -- [deg] Circle center longitude + lat DOUBLE, -- [deg] Circle center latitude + radius DOUBLE -- [deg] Circle radius + ) + +.. code-block:: sql + + qserv_areaspec_ellipse( + lon DOUBLE, -- [deg] Ellipse center longitude + lat DOUBLE, -- [deg] Ellipse center latitude + semiMajorAxisAngle DOUBLE, -- [arcsec] Semi-major axis length + semiMinorAxisAngle DOUBLE, -- [arcsec] Semi-minor axis length + positionAngle DOUBLE -- [deg] Ellipse position angle, east of north + ) + +.. code-block:: sql + + qserv_areaspec_poly( + v1Lon DOUBLE, -- [deg] Longitude angle of first polygon vertex + v1Lat DOUBLE, -- [deg] Latitude angle of first polygon vertex + v2Lon DOUBLE, -- [deg] Longitude angle of second polygon vertex + v2Lat DOUBLE, -- [deg] Latitude angle of second polygon vertex + ... + ) + +Example: + +.. code-block:: sql + + SELECT objectId FROM Object WHERE qserv_areaspec_box(0, 0, 3, 10) + +Note that as discussed in the "Restrictions" section below, spatial constraints **must** be expressed through +the ``qserv_areaspec_*`` functions. + + +Restrictions +============ + +This section covers restriction you need to be aware of when interacting with Qserv. + +Spatial constraints should be expressed through our ``qserv_areaspec_*`` functions +---------------------------------------------------------------------------------- + +Spatial constraints should be expressed through ``qserv_areaspec_*`` functions (see Extensions section above for +details). Any other way of specifying spatial restrictions may be significantly slower (e.g., they might +devolve to be full table scan). For example, the form: + +.. code-block:: sql + + WHERE ra BETWEEN AND + AND decl BETWEEN AND + +even though it is equivalent to: + +.. code-block:: sql + + qserv_areaspec_box(, , , ) + +should not be used. + + +Spatial constraints must appear at the beginning of ``WHERE`` +------------------------------------------------------------- + +Spatial constraints must appear at the very beginning of the ``WHERE`` clause (before or after the ``objectId`` +constraint, if there is any). + + +Only one spatial constraint is allowed per query +------------------------------------------------ + +Only one spatial constraint expressed through ``qserv_areaspec_*`` is allowed per query, e.g., these are examples +of invalid queries: + +.. 
code-block:: sql + + WHERE qserv_areaspec_box(1, 35, 2, 38) + AND qserv_areaspec_box(5, 77, 6, 78) + + WHERE qserv_areaspec_box(1, 35, 2, 38) + AND qserv_areaspec_circle(5, 77, 0.1) + + +Arguments passed to spatial constraints functions must be simple literals +------------------------------------------------------------------------- + +The arguments passed to the ``qserv_areaspec_*`` functions must be simple literals. They may not contain any +references, e.g., they may not refer to columns. + +Example of an invalid entry: + +.. code-block:: sql + + WHERE qserv_areaspec_box(3+4, ra*2, 0, 0) + + +``OR`` is not allowed after ``qserv_areaspec_*`` constraint +----------------------------------------------------------- + +If the query has extra constraints after the ``qserv_areaspec_*`` constraint, ``OR`` is not allowed immediately after +``qserv_areaspec_*``, for example: + +.. code-block:: sql + + SELECT objectId, ra, decl, x + FROM Object + WHERE qserv_areaspec_box(1, 35, 2, 38) AND x > 3.5 + +is valid, but: + +.. code-block:: sql + + SELECT objectId, ra, decl, x + FROM Object + WHERE qserv_areaspec_box(1, 35, 2, 38) OR x > 3.5 + +is not allowed. We expect to remove this restriction in the future, see +`DM-2888 `_. + + +The director index constraint must be expressed through ``=``, ``IN``, or ``BETWEEN`` +------------------------------------------------------------------------------------- + +If the query has an objectId constraint, it should be expressed in one of these three forms: + +.. code-block:: sql + + SELECT * FROM Object WHERE objectId = 123 + SELECT * FROM Object WHERE objectId IN (123, 453, 3465) + SELECT * FROM Object WHERE objectId BETWEEN 123 AND 130 + +E.g., don't try to express it as ``WHERE objectId != 1``, or ``WHERE objectId > 123``, etc. + +Note, we expect to allow decomposing objectId into bitfields (e.g., for sampling) in the future. See +`DM-2889 `_. + + +Column(s) used in ``ORDER BY`` or ``GROUP BY`` must appear in ``SELECT`` +------------------------------------------------------------------------ + +At the moment we require columns used in ``ORDER BY`` or ``GROUP BY`` to be listed in ``SELECT``. +Example of an invalid query: + +.. code-block:: sql + + SELECT x FROM T ORDER BY y + +Correct version: + +.. code-block:: sql + + SELECT y, x FROM T ORDER BY y + +Expressions/functions in ``ORDER BY`` clauses are not allowed +------------------------------------------------------------- + +In SQL92 ``ORDER BY`` is limited to actual table columns, thus expressions or functions in ``ORDER BY`` are rejected. +This is true for Qserv too. + +Example of an invalid use of the ``ORDER BY`` clause: + +.. code-block:: sql + + SELECT id, ABS(x) FROM Source ORDER BY ABS(x) + +However, one can bypass this by using an alias, for example: + +.. code-block:: sql + + SELECT id, ABS(x) AS ax FROM Source ORDER BY ax + +Sub-queries are NOT supported +----------------------------- + +Sub-queries are not supported. + + +Commands that modify tables are disallowed +------------------------------------------ + +Commands for creating or modifying tables are disabled. These commands include ``INSERT``, ``UPDATE``, ``LOAD INTO``, +``CREATE``, ``ALTER``, ``TRUNCATE``, and ``DROP``. We will revisit this as we start adding support for Level 3. + + +Outer joins are not supported with near-neighbor queries +-------------------------------------------------------- + +Qserv does not support ``LEFT`` or ``RIGHT`` joins with near-neighbor predicates.
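For illustration only, a hypothetical query of the following shape (reusing the ``scisql_angSep`` predicate and the
``ra_PS`` / ``decl_PS`` columns from the near-neighbor example later in this document) falls under this restriction:

.. code-block:: sql

   -- Not supported: an outer join combined with a near-neighbor predicate
   SELECT o1.objectId, o2.objectId
   FROM Object o1
   LEFT JOIN Object o2
     ON scisql_angSep(o1.ra_PS, o1.decl_PS, o2.ra_PS, o2.decl_PS) < 0.05
   WHERE qserv_areaspec_box(0, 0, 0.2, 1)

The supported alternative is to express the near-neighbor search as an inner join, as shown in the
"Find near neighbors in a given region" example below.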
+ + +MySQL-specific syntax is not supported +-------------------------------------- + +MySQL-specific syntax is not supported. Example of unsupported syntax that will be rejected: ``NAME_CONST``. + + +Repeated column names through ``*`` are not supported +----------------------------------------------------- + +Queries with a ``*`` that resolves to repeated column name are not supported. Example: + +.. code-block:: sql + + SELECT *, id FROM Object + +will fail if the table Object has a column called ``id``. Similarly, this query will fail: + +.. code-block:: sql + + SELECT o.*, s.* FROM Object AS o, Source AS s + +if both tables Object and Source have a column called ``id``. + +A workaround would be to select columns explicitly and alias them, e.g. : + +.. code-block:: sql + + SELECT o.id AS oId, s.id AS sId FROM Object AS o, Source AS s + +``USE INDEX()`` is not supported +-------------------------------- + +Qserv will reject query with ``USE INDEX`` hint. + +Variables are not supported +--------------------------- + +You can't select into a variable. For example: + +.. code-block:: sql + + SELECT scisql_s2CPolyToBin(...) FROM T INTO @poly + +will fail. Related story is at `DM-2874 `_. + +User Defined Functions +====================== + +Qserv installation always comes with a set of predefined user defined functions: + +- spherical geometry aimed to allow quick answers to the following sorts of questions: + + - Which points in a table lie inside a region on the sphere? For example, an astronomer might wish to know which stars and + galaxies lie inside the region of the sky observed by a single camera CCD. + - Which spherical regions in a table contain a particular point? For example, an astronomer might wish to know which telescope images + overlap the position of interesting object X + +- photometry, aimed to provide conversions between raw fluxes, calibrated (AB) fluxes and AB magnitudes. + +For details, see `Science Tools for MySQL `_. + + +Example Queries Supported +========================= + +Counts and simple selections +---------------------------- + +You can count objects and run simple selections. Few examples: + + +Count the number of rows in a table +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: sql + + SELECT COUNT(*) FROM Object + + +Find rows with a particular ``id`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: sql + + SELECT * FROM Object WHERE objectId = + +Select rows in a given area +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: sql + + SELECT objectId FROM Object + WHERE qserv_areaspec_box(1, 35, 2, 38) + + +Select rows in a given area meeting certain criteria +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: sql + + SELECT COUNT(*) FROM Object + WHERE qserv_areaspec_box(0.1, -6, 4, 6) + AND x = 3.4 + AND y BETWEEN 1 AND 2 + +Joins +----- + +Join two tables +^^^^^^^^^^^^^^^ + +.. code-block:: sql + + SELECT s.ra, s.decl, o.raRange, o.declRange + FROM Object o, Source s + WHERE o.objectId = + AND o.objectId = s.objectId + +or: + +.. code-block:: sql + + SELECT s.ra, s.decl, o.raRange, o.declRange + FROM Object o, Source s USING (objectId) + WHERE o.objectId = + + +Find near neighbors in a given region +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
code-block:: sql + + SELECT o1.objectId AS objId1, + o2.objectId AS objId2, + scisql_angSep(o1.ra_PS, + o1.decl_PS, + o2.ra_PS, + o2.decl_PS) AS distance + FROM Object o1, Object o2 + WHERE qserv_areaspec_box(0, 0, 0.2, 1) + AND scisql_angSep(o1.ra_PS, + o1.decl_PS, + o2.ra_PS, + o2.decl_PS) < 0.05 + AND o1.objectId <> o2.objectId + + +``LIMIT``, ``ORDER BY`` +----------------------- + +Limit results, sort results +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: sql + + SELECT * FROM Object + WHERE x > 4 + ORDER BY x + LIMIT 100 + + +Known Bugs +========== + +The list of all known / reported problems can be found at: `Data Access and Database Team User-facing +Bugs `_. + + +Selecting by objectId can miss a row +------------------------------------ + +Selecting rows using objectId sometimes does not return rows it should. +For details, see: `DM-2864 `_. + + +``WHERE objectId BETWEEN`` fails +-------------------------------- + +As explained above, queries in the form ``WHERE objectId BETWEEN`` are discouraged. In fact, Qserv will +currently return a cryptic message when such query is executed. For details, see +`DM-2873 `_. + + +Notes of Performance +==================== + +Use objectId when selecting sources +----------------------------------- + +If you need to locate a small number of sources, try to use objectId if you can. If you don't, your query will +require an index scan for every chunk of the Source table (which can potentially mean thousands of +chunk-queries). For example this query will require it: + +.. code-block:: sql + + SELECT * FROM Source WHERE sourceId = 500 + +but asking for sources related to a given object, like this one: + +.. code-block:: sql + + SELECT * FROM Source WHERE objectId = 123 AND sourceId = 500 + +will require an index scan for just a single chunk, and thus will typically be much faster.
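The same chunk-pruning argument should apply when fetching sources for a small set of objects at once.
For example, a hypothetical query like the following (the objectId values are made up) uses one of the
objectId constraint forms accepted by the director index (``=``, ``IN``, ``BETWEEN``) and therefore
should only touch the chunks that contain those objects:

.. code-block:: sql

   -- Sources for a few known objects; the objectId constraint limits the scan
   -- to the chunks holding those objects
   SELECT * FROM Source WHERE objectId IN (123, 124, 125)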