duckdb · c-herrewijn · Jan 14, 2025 · Jan 15, 2025 · Jan 15, 2025 · Jan 16, 2025
diff --git a/.github/workflows/update-docs.yml b/.github/workflows/update-docs.yml
@@ -15,7 +15,7 @@ jobs:
           token: ${{ secrets.GH_UPDATE_DOCS_TOKEN }}
       - uses: actions/setup-python@v5
         with:
-          python-version: '3.10'
+          python-version: '3.12'
           cache: 'pip'
       - run: npm install jsdoc-to-markdown
       - run: pip install wheel pip -U --break-system-packages

diff --git a/docs/sql/functions/blob.md b/docs/sql/functions/blob.md
@@ -9,16 +9,25 @@ redirect_from:
 
 This section describes functions and operators for examining and manipulating [`BLOB` values]({% link docs/sql/data_types/blob.md %}).
 
+<!-- Start of section generated by scripts/generate_sql_function_docs.py -->
 <!-- markdownlint-disable MD056 -->
 
 | Name | Description |
 |:--|:-------|
 | [`blob || blob`](#blob--blob) | `BLOB` concatenation. |
+| [`base64(blob)`](#base64blob) | Converts a `blob` to a base64 encoded `string`. |
 | [`decode(blob)`](#decodeblob) | Converts `blob` to `VARCHAR`. Fails if `blob` is not valid UTF-8. |
 | [`encode(string)`](#encodestring) | Converts the `string` to `BLOB`. Converts UTF-8 characters into literal encoding. |
+| [`from_base64(string)`](#from_base64string) | Converts a base64 encoded `string` to a character string (`BLOB`). |
+| [`from_binary(value)`](#from_binaryvalue) | Converts a value from binary representation to a blob. |
+| [`from_hex(value)`](#from_hexvalue) | Converts a value from hexadecimal representation to a blob. |
 | [`hex(blob)`](#hexblob) | Converts `blob` to `VARCHAR` using hexadecimal encoding. |
 | [`octet_length(blob)`](#octet_lengthblob) | Number of bytes in `blob`. |
 | [`read_blob(source)`](#read_blobsource) | Returns the content from `source` (a filename, a list of filenames, or a glob pattern) as a `BLOB`. See the [`read_blob` guide]({% link docs/guides/file_formats/read_file.md %}#read_blob) for more details. |
+| [`to_base64(blob)`](#to_base64blob) | Converts a `blob` to a base64 encoded `string`. |
+| [`to_hex(blob)`](#to_hexblob) | Converts `blob` to `VARCHAR` using hexadecimal encoding. |
+| [`unbin(value)`](#unbinvalue) | Converts a value from binary representation to a blob. |
+| [`unhex(value)`](#unhexvalue) | Converts a value from hexadecimal representation to a blob. |
 
 <!-- markdownlint-enable MD056 -->
 
@@ -30,11 +39,19 @@ This section describes functions and operators for examining and manipulating [`
 | **Example** | `'\xAA'::BLOB || '\xBB'::BLOB` |
 | **Result** | `\xAA\xBB` |
 
+#### `base64(blob)`
+
+<div class="nostroke_table"></div>
+
+| **Description** | Converts a `blob` to a base64 encoded `string`. |
+| **Example** | `base64('A'::BLOB)` |
+| **Result** | `QQ==` |
+
 #### `decode(blob)`
 
 <div class="nostroke_table"></div>
 
-| **Description** | Convert `blob` to `VARCHAR`. Fails if `blob` is not valid UTF-8. |
+| **Description** | Converts `blob` to `VARCHAR`. Fails if `blob` is not valid UTF-8. |
 | **Example** | `decode('\xC3\xBC'::BLOB)` |
 | **Result** | `ü` |
 
@@ -46,6 +63,30 @@ This section describes functions and operators for examining and manipulating [`
 | **Example** | `encode('my_string_with_ü')` |
 | **Result** | `my_string_with_\xC3\xBC` |
 
+#### `from_base64(string)`
+
+<div class="nostroke_table"></div>
+
+| **Description** | Converts a base64 encoded `string` to a character string (`BLOB`). |
+| **Example** | `from_base64('QQ==')` |
+| **Result** | `A` |
+
+#### `from_binary(value)`
+
+<div class="nostroke_table"></div>
+
+| **Description** | Converts a value from binary representation to a blob. |
+| **Example** | `from_binary('0110')` |
+| **Result** | `\x06` |
+
+#### `from_hex(value)`
+
+<div class="nostroke_table"></div>
+
+| **Description** | Converts a value from hexadecimal representation to a blob. |
+| **Example** | `from_hex('2A')` |
+| **Result** | `*` |
+
 #### `hex(blob)`
 
 <div class="nostroke_table"></div>
@@ -69,3 +110,37 @@ This section describes functions and operators for examining and manipulating [`
 | **Description** | Returns the content from `source` (a filename, a list of filenames, or a glob pattern) as a `BLOB`. See the [`read_blob` guide]({% link docs/guides/file_formats/read_file.md %}#read_blob) for more details. |
 | **Example** | `read_blob('hello.bin')` |
 | **Result** | `hello\x0A` |
+
+#### `to_base64(blob)`
+
+<div class="nostroke_table"></div>
+
+| **Description** | Converts a `blob` to a base64 encoded `string`. |
+| **Example** | `to_base64('A'::BLOB)` |
+| **Result** | `QQ==` |
+
+#### `to_hex(blob)`
+
+<div class="nostroke_table"></div>
+
+| **Description** | Converts `blob` to `VARCHAR` using hexadecimal encoding. |
+| **Example** | `to_hex('\xAA\xBB'::BLOB)` |
+| **Result** | `AABB` |
+
+#### `unbin(value)`
+
+<div class="nostroke_table"></div>
+
+| **Description** | Converts a value from binary representation to a blob. |
+| **Example** | `unbin('0110')` |
+| **Result** | `\x06` |
+
+#### `unhex(value)`
+
+<div class="nostroke_table"></div>
+
+| **Description** | Converts a value from hexadecimal representation to a blob. |
+| **Example** | `unhex('2A')` |
+| **Result** | `*` |
+
+<!-- End of section generated by scripts/generate_sql_function_docs.py -->
diff --git a/scripts/generate_all_docs.sh b/scripts/generate_all_docs.sh
@@ -13,5 +13,6 @@ echo "Generating docs using duckdb source in $DUCKDB"
 python3 ./scripts/generate_config_docs.py $DUCKDB/build/release/duckdb
 python3 ./scripts/generate_c_api_docs.py $DUCKDB
 python3 ./scripts/generate_python_docs.py
+python3 ./scripts/generate_sql_function_docs.py $DUCKDB/build/release/duckdb
 node ./scripts/generate_nodejs_docs.js $DUCKDB/../duckdb-node
 python3 ./scripts/generate_function_json.py --source $DUCKDB --binary $DUCKDB/build/release/duckdb
diff --git a/scripts/generate_sql_function_docs.py b/scripts/generate_sql_function_docs.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+
+import duckdb
+
+# Maps each documentation file to the function category it documents:
+# - key: path of the markdown file whose generated section is rewritten
+# - value: category name matched against the `categories` column of duckdb_functions()
+DOC_CATEGORY_MAP = {'docs/sql/functions/blob.md': 'blob'}
+
+# 'Functions' that are binary operators: they are rendered between their two
+# arguments (`a || b`) instead of in call notation (`f(a, b)`).
+BINARY_OPERATORS = ['||']
+
+# Overrides for / additions to the rows returned by duckdb_functions():
+# - key: tuple: (category, function_name)
+# - value: tuple: (parameters, description, examples)
+# An entry replaces the queried row with the same function name, or is appended
+# as an extra row when the query returned no such function.
+OVERRIDES_MAP = {
+    ('blob', '||'): (
+        ['blob', 'blob'],
+        '`BLOB` concatenation.',
+        [r"'\xAA'::BLOB || '\xBB'::BLOB"],
+    ),
+    ('blob', 'read_blob'): (
+        ['source'],
+        'Returns the content from `source` (a filename, a list of filenames, or a glob pattern) as a `BLOB`. See the `read_blob` guide for more details.',
+        ["read_blob('hello.bin')"],
+    ),
+}
+
+# Phrases inside a description that are turned into `{% link ... %}` markdown links:
+# - key: the exact phrase to look for in the description
+# - value: tuple: (link target file, suffix appended after the link tag, e.g. an anchor)
+URL_CONVERSIONS = {
+    '`read_blob` guide': ('docs/guides/file_formats/read_file.md', '#read_blob')
+}
+
+# For these functions, we don't run the examples; the value given here is
+# written verbatim as the documented result:
+# - key: tuple: (category, function_name)
+# - value: the result text to show
+FIXED_EXAMPLES = {('blob', 'read_blob'): r"hello\x0A"}
+
+
+def main():
+    for doc_file, category in DOC_CATEGORY_MAP.items():
+        generate_doc_file(doc_file, category)
+
+
+def generate_doc_file(doc_file: str, category: str) -> None:
+    function_data: list[tuple[str, list[str], str, list[str]]] = get_function_data(
+        category
+    )
+    startline = (
+        "<!-- Start of section generated by scripts/generate_sql_function_docs.py -->\n"
+    )
+    endline = (
+        "<!-- End of section generated by scripts/generate_sql_function_docs.py -->\n"
+    )
+    with open(doc_file, "r") as f:
+        doc_text = f.read()
+    if startline not in doc_text or endline not in doc_text:
+        print(
+            f"doc generation failed, start or end line is missing in file " + doc_file
+        )
+        exit(1)
+    else:
+        split_start = doc_text.rsplit(startline, 1)
+        split_end = doc_text.rsplit(endline, 1)
+        doc_text_new = (
+            split_start[0]
+            + startline
+            + generate_docs_table(function_data)
+            + generate_docs_records(function_data, category)
+            + endline
+            + split_end[1]
+        )
+        with open(doc_file, "w+") as f:
+            f.write(doc_text_new)
+
+
+def get_function_data(category: str) -> list[tuple[str, list[str], str, list[str]]]:
+    query = f"""
+select
+    function_name,
+    parameters,
+    description,
+    examples,
+from
+    duckdb_functions()
+where
+    list_contains(categories, '{category}')
+order by
+    function_name
+;
+"""
+    function_data: list[tuple[str, list[str], str, list[str]]] = duckdb.sql(
+        query
+    ).fetchall()
+
+    # apply overrides and add additional functions
+    all_function_dict = {func[0]: idx for idx, func in enumerate(function_data)}
+    for override_category, function_name in OVERRIDES_MAP:
+        if override_category == category:
+            params, description, examples = OVERRIDES_MAP[(category, function_name)]
+            if function_name in all_function_dict:
+                function_data[all_function_dict[function_name]] = (
+                    function_name,
+                    params,
+                    description,
+                    examples,
+                )
+            else:
+                function_data.append((function_name, params, description, examples))
+    function_data.sort()
+
+    # rotate non-alphanumeric functions (i.e. operators) from bottom to top
+    # (bit crude, because i don't want to add pip install icu dependency)
+    idx = len(function_data) - 1
+    operator_count = 0
+    while idx >= 0:
+        if not function_data[idx][0][0].isalnum():
+            operator_count += 1
+        else:
+            break
+        idx -= 1
+    function_data = (
+        function_data[-1 * operator_count :] + function_data[: -1 * operator_count]
+    )
+
+    # apply url conversions
+    for conversion in URL_CONVERSIONS:
+        for idx, function in enumerate(function_data):
+            if conversion in function[2]:
+                url_desc = function[2].replace(
+                    conversion,
+                    f"[{conversion}]"
+                    "({% "
+                    f"link {URL_CONVERSIONS[conversion][0]}"
+                    " %}"
+                    f"{URL_CONVERSIONS[conversion][1]})",
+                )
+                function_name, parameters, _, examples = function_data[idx]
+                function_data[idx] = (function_name, parameters, url_desc, examples)
+    return function_data
+
+
+def generate_docs_table(function_data: list[tuple[str, list[str], str, list[str]]]):
+    res = "<!-- markdownlint-disable MD056 -->\n\n"
+    res += "| Name | Description |\n|:--|:-------|\n"
+    for func in function_data:
+        function_name, params, description, _ = func
+        if function_name in BINARY_OPERATORS and len(params) == 2:
+            res += f"| [`{params[0]} {function_name} {params[1]}`](#{params[0]}--{params[1]}) | {description} |\n"
+        else:
+            res += f"| [`{function_name}({", ".join(params)})`](#{function_name.lstrip('@*!^')}{"-".join(params)}) | {description} |\n"
+    res += "\n<!-- markdownlint-enable MD056 -->\n"
+    return res
+
+
+def generate_docs_records(
+    function_data: list[tuple[str, list[str], str, list[str]]], category: str
+):
+    res = "\n"
+    for func in function_data:
+        function_name, params, description, examples = func
+        if function_name in BINARY_OPERATORS and len(params) == 2:
+            res += f"#### `{params[0]} {function_name} {params[1]}`\n\n"
+        else:
+            res += f"#### `{function_name}({", ".join(params)})`\n\n"
+        res += '<div class="nostroke_table"></div>\n\n'
+        res += f"| **Description** | {description} |\n"
+        res += f"| **Example** | `{examples[0]}` |\n"
+        if (category, function_name) in FIXED_EXAMPLES:
+            res += f"| **Result** | `{FIXED_EXAMPLES[(category, function_name)]}` |\n\n"
+        else:
+            res += f"| **Result** | `{duckdb.sql(rf"select {examples[0]}::VARCHAR").fetchone()[0]}` |\n\n"
+    return res
+
+
+# Regenerate the docs only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()