From 788fceba2115fa2fca79eaba2b3d767f20ec1576 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Jan 2025 19:44:03 +0100 Subject: [PATCH 01/13] cleanup drafts --- definitions/scripts/crawl_cleanup.js | 144 +++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 definitions/scripts/crawl_cleanup.js diff --git a/definitions/scripts/crawl_cleanup.js b/definitions/scripts/crawl_cleanup.js new file mode 100644 index 00000000..9cb035a5 --- /dev/null +++ b/definitions/scripts/crawl_cleanup.js @@ -0,0 +1,144 @@ +const month = constants.currentMonth + +// Restore categories for valid technologies +operate('crawl_page_categories_clean').queries(ctx => ` +WITH wappalyzer AS ( + SELECT + name as technology, + categories + FROM ${ctx.ref('wappalyzer', 'technologies')} +), pages AS ( + SELECT + client, + page, + tech.technology, + tech.categories, + tech.info + FROM ${ctx.ref('crawl', 'pages')} + LEFT JOIN pages.technologies AS tech + WHERE date = '${month}' + ${constants.devRankFilter} +), impacted_pages AS ( + SELECT DISTINCT + client, + page + FROM pages + WHERE + technology IS NOT NULL AND + ARRAY_LENGTH(categories) = 0 +), repaired AS ( + SELECT + client, + page, + ARRAY_AGG(STRUCT( + pages.technology, + wappalyzer.categories, + pages.info + )) AS technologies + FROM pages + INNER JOIN impacted_pages + USING (client, page) + LEFT JOIN wappalyzer + ON pages.technology = wappalyzer.technology + GROUP BY 1,2 +) + +SELECT + client, + page, + technologies +FROM repaired +`) + +// Cleanup corrupted technologies +operate('crawl_page_categories_clean').queries(ctx => ` +CREATE TEMP TABLE technologies_cleaned AS ( + WITH wappalyzer AS ( + SELECT + name as technology, + category + FROM ${ctx.ref('wappalyzer', 'apps')} + LEFT JOIN apps.categories AS category + ), pages AS ( + SELECT + date, + client, + page, + technologies + FROM ${ctx.ref('crawl', 'pages')} + WHERE date = '${month}' + ), impacted_pages AS ( + SELECT DISTINCT + date, + client, + page + FROM pages + LEFT JOIN pages.technologies AS tech, + LEFT JOIN tech.categories AS category + LEFT JOIN wappalyzer + USING (technology, category) + WHERE wappalyzer.technology IS NULL + ), flattened_technologies AS ( + SELECT + date, + client, + page, + technology, + category, + info + FROM pages + LEFT JOIN pages.technologies AS tech + LEFT JOIN tech.categories AS category + WHERE page IN (SELECT DISTINCT page FROM impacted_pages) + ), whitelisted_technologies AS ( + SELECT + date, + client, + page, + f.technology, + f.category, + f.info + FROM flattened_technologies f + INNER JOIN wappalyzer + USING (technology, category) + ), reconstructed_technologies AS ( + SELECT + date, + client, + page, + ARRAY_AGG(STRUCT( + technology, + categories, + info + )) AS technologies + FROM ( + SELECT + date, + client, + page, + technology, + ARRAY_AGG(DISTINCT category IGNORE NULLS) AS categories, + info + FROM whitelisted_technologies + GROUP BY date, client, page, technology, info + ) + GROUP BY date, client, page + ) + + SELECT + date, + client, + page, + r.technologies + FROM impacted_pages + LEFT JOIN reconstructed_technologies r + USING (date, client, page) +); + +UPDATE crawl.pages +SET technologies = technologies_cleaned.technologies +FROM technologies_cleaned +WHERE pages.date = crawl_month AND + pages.client = technologies_cleaned.client AND + pages.page = technologies_cleaned.page; +`) From 8d5d3eb1c749b2366d12870089b1e6364ad8a310 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Jan 2025 20:42:58 +0100 Subject: [PATCH 02/13] aggregated query --- definitions/scripts/crawl_cleanup.js | 161 +++++++++------------------ 1 file changed, 50 insertions(+), 111 deletions(-) diff --git a/definitions/scripts/crawl_cleanup.js b/definitions/scripts/crawl_cleanup.js index 9cb035a5..06aa12c1 100644 --- a/definitions/scripts/crawl_cleanup.js +++ b/definitions/scripts/crawl_cleanup.js @@ -1,144 +1,83 @@ -const month = constants.currentMonth +const issue_month = constants.currentMonth -// Restore categories for valid technologies -operate('crawl_page_categories_clean').queries(ctx => ` -WITH wappalyzer AS ( - SELECT - name as technology, - categories - FROM ${ctx.ref('wappalyzer', 'technologies')} -), pages AS ( - SELECT - client, - page, - tech.technology, - tech.categories, - tech.info - FROM ${ctx.ref('crawl', 'pages')} - LEFT JOIN pages.technologies AS tech - WHERE date = '${month}' - ${constants.devRankFilter} -), impacted_pages AS ( - SELECT DISTINCT - client, - page - FROM pages - WHERE - technology IS NOT NULL AND - ARRAY_LENGTH(categories) = 0 -), repaired AS ( - SELECT - client, - page, - ARRAY_AGG(STRUCT( - pages.technology, - wappalyzer.categories, - pages.info - )) AS technologies - FROM pages - INNER JOIN impacted_pages - USING (client, page) - LEFT JOIN wappalyzer - ON pages.technology = wappalyzer.technology - GROUP BY 1,2 -) - -SELECT - client, - page, - technologies -FROM repaired -`) - -// Cleanup corrupted technologies -operate('crawl_page_categories_clean').queries(ctx => ` -CREATE TEMP TABLE technologies_cleaned AS ( +operate('crawl_page_technologies_patch').queries(ctx => ` +CREATE TEMP TABLE crawl_page_technologies_patch AS ( WITH wappalyzer AS ( SELECT - name as technology, - category - FROM ${ctx.ref('wappalyzer', 'apps')} - LEFT JOIN apps.categories AS category + name AS technology, + categories + FROM ${ctx.ref('wappalyzer', 'technologies')} ), pages AS ( SELECT - date, client, page, - technologies - FROM ${ctx.ref('crawl', 'pages')} - WHERE date = '${month}' - ), impacted_pages AS ( + tech.technology, + tech.categories, + tech.info + FROM ${ctx.ref('crawl', 'pages')} AS pages + LEFT JOIN pages.technologies AS tech + WHERE date = '${issue_month}' ${constants.devRankFilter} + ), + + -- Identify impacted pages + impacted_pages AS ( SELECT DISTINCT - date, client, page FROM pages - LEFT JOIN pages.technologies AS tech, - LEFT JOIN tech.categories AS category - LEFT JOIN wappalyzer - USING (technology, category) - WHERE wappalyzer.technology IS NULL - ), flattened_technologies AS ( + WHERE + -- Categories are empty OR the technology is corrupted + (technology IS NOT NULL AND ARRAY_LENGTH(categories) = 0) + OR technology NOT IN (SELECT technology FROM wappalyzer) + ), + + -- Flatten technologies for reconstruction + flattened_technologies AS ( SELECT - date, client, page, technology, - category, + categories, info FROM pages - LEFT JOIN pages.technologies AS tech - LEFT JOIN tech.categories AS category WHERE page IN (SELECT DISTINCT page FROM impacted_pages) - ), whitelisted_technologies AS ( - SELECT - date, - client, - page, - f.technology, - f.category, - f.info - FROM flattened_technologies f - INNER JOIN wappalyzer - USING (technology, category) - ), reconstructed_technologies AS ( + ), + + -- Reconstruct valid technologies + reconstructed_technologies AS ( SELECT - date, client, page, ARRAY_AGG(STRUCT( - technology, - categories, - info + f.technology, + -- Use the categories from Wappalyzer if the categories are empty + IF(ARRAY_LENGTH(f.categories) = 0, w.categories, f.categories) AS categories, + f.info )) AS technologies - FROM ( - SELECT - date, - client, - page, - technology, - ARRAY_AGG(DISTINCT category IGNORE NULLS) AS categories, - info - FROM whitelisted_technologies - GROUP BY date, client, page, technology, info - ) - GROUP BY date, client, page + FROM flattened_technologies f + LEFT JOIN wappalyzer w + ON f.technology = w.technology + -- Only reconstruct technologies existing in Wappalyzer + WHERE f.technology IN (SELECT technology FROM wappalyzer) + GROUP BY + client, + page ) SELECT - date, client, page, - r.technologies + technologies FROM impacted_pages - LEFT JOIN reconstructed_technologies r - USING (date, client, page) + LEFT JOIN reconstructed_technologies + USING (client, page) ); +-- Update the crawl.pages table with the cleaned and restored technologies UPDATE crawl.pages -SET technologies = technologies_cleaned.technologies -FROM technologies_cleaned -WHERE pages.date = crawl_month AND - pages.client = technologies_cleaned.client AND - pages.page = technologies_cleaned.page; +SET technologies = crawl_page_technologies_patch.technologies +FROM crawl_page_technologies_patch +WHERE pages.date = '${issue_month}' AND + pages.client = crawl_page_technologies_patch.client AND + pages.page = crawl_page_technologies_patch.page; `) From 42bf6fa4b22374b404bca8e056828bbd55e352b8 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Jan 2025 20:53:32 +0100 Subject: [PATCH 03/13] crawl pipeline patch --- definitions/output/crawl/pages.js | 95 ++++++++++++++++++++++++++-- definitions/scripts/crawl_cleanup.js | 83 ------------------------ 2 files changed, 90 insertions(+), 88 deletions(-) delete mode 100644 definitions/scripts/crawl_cleanup.js diff --git a/definitions/output/crawl/pages.js b/definitions/output/crawl/pages.js index 4a9c1d5c..f7f84c33 100644 --- a/definitions/output/crawl/pages.js +++ b/definitions/output/crawl/pages.js @@ -52,23 +52,108 @@ publish('pages', { DELETE FROM ${ctx.self()} WHERE date = '${constants.currentMonth}' AND client = 'desktop'; -`).query(ctx => ` + +INSERT INTO ${ctx.self()} SELECT * FROM ${ctx.ref('crawl_staging', 'pages')} WHERE date = '${constants.currentMonth}' AND client = 'desktop' - ${constants.devRankFilter} -`).postOps(ctx => ` + ${constants.devRankFilter}; + DELETE FROM ${ctx.self()} WHERE date = '${constants.currentMonth}' AND client = 'mobile'; - -INSERT INTO ${ctx.self()} +`).query(ctx => ` SELECT * FROM ${ctx.ref('crawl_staging', 'pages')} WHERE date = '${constants.currentMonth}' AND client = 'mobile' ${constants.devRankFilter} +`).postOps(ctx => ` +CREATE TEMP TABLE technologies_cleaned AS ( + WITH wappalyzer AS ( + SELECT + name AS technology, + categories AS valid_categories + FROM ${ctx.ref('wappalyzer', 'technologies')} + ), pages AS ( + SELECT + client, + page, + tech.technology, + tech.categories, + tech.info + FROM ${ctx.ref('crawl', 'pages')} AS pages + LEFT JOIN pages.technologies AS tech + WHERE date = '${constants.currentMonth}' ${constants.devRankFilter} + ), -- Identify impacted pages + impacted_pages AS ( + SELECT DISTINCT + client, + page + FROM pages + WHERE + -- Categories are empty OR technology is corrupted OR contains invalid categories + (technology IS NOT NULL AND ARRAY_LENGTH(categories) = 0) + OR technology NOT IN (SELECT technology FROM wappalyzer) + OR EXISTS ( + SELECT category + FROM UNNEST(categories) AS category + WHERE category NOT IN (SELECT DISTINCT UNNEST(valid_categories) FROM wappalyzer) + ) + ), -- Flatten technologies for reconstruction + flattened_technologies AS ( + SELECT + client, + page, + technology, + ARRAY( + SELECT category + FROM UNNEST(categories) AS category + WHERE category IN ( + SELECT DISTINCT UNNEST(valid_categories) + FROM wappalyzer + WHERE technology = flattened_technologies.technology + ) + ) AS valid_categories, + info + FROM pages + WHERE page IN (SELECT DISTINCT page FROM impacted_pages) + ), -- Reconstruct valid technologies + reconstructed_technologies AS ( + SELECT + client, + page, + ARRAY_AGG(STRUCT( + f.technology, + IF(ARRAY_LENGTH(f.valid_categories) = 0, w.valid_categories, f.valid_categories) AS categories, + f.info + )) AS technologies + FROM flattened_technologies f + LEFT JOIN wappalyzer w + ON f.technology = w.technology + WHERE f.technology IN (SELECT technology FROM wappalyzer) + GROUP BY + client, + page + ) + + SELECT + client, + page, + r.technologies + FROM impacted_pages + LEFT JOIN reconstructed_technologies r + USING (client, page) +); + +-- Update the crawl.pages table with the cleaned and restored technologies +UPDATE crawl.pages +SET technologies = technologies_cleaned.technologies +FROM technologies_cleaned +WHERE pages.date = '${constants.currentMonth}' AND + pages.client = technologies_cleaned.client AND + pages.page = technologies_cleaned.page; `) diff --git a/definitions/scripts/crawl_cleanup.js b/definitions/scripts/crawl_cleanup.js deleted file mode 100644 index 06aa12c1..00000000 --- a/definitions/scripts/crawl_cleanup.js +++ /dev/null @@ -1,83 +0,0 @@ -const issue_month = constants.currentMonth - -operate('crawl_page_technologies_patch').queries(ctx => ` -CREATE TEMP TABLE crawl_page_technologies_patch AS ( - WITH wappalyzer AS ( - SELECT - name AS technology, - categories - FROM ${ctx.ref('wappalyzer', 'technologies')} - ), pages AS ( - SELECT - client, - page, - tech.technology, - tech.categories, - tech.info - FROM ${ctx.ref('crawl', 'pages')} AS pages - LEFT JOIN pages.technologies AS tech - WHERE date = '${issue_month}' ${constants.devRankFilter} - ), - - -- Identify impacted pages - impacted_pages AS ( - SELECT DISTINCT - client, - page - FROM pages - WHERE - -- Categories are empty OR the technology is corrupted - (technology IS NOT NULL AND ARRAY_LENGTH(categories) = 0) - OR technology NOT IN (SELECT technology FROM wappalyzer) - ), - - -- Flatten technologies for reconstruction - flattened_technologies AS ( - SELECT - client, - page, - technology, - categories, - info - FROM pages - WHERE page IN (SELECT DISTINCT page FROM impacted_pages) - ), - - -- Reconstruct valid technologies - reconstructed_technologies AS ( - SELECT - client, - page, - ARRAY_AGG(STRUCT( - f.technology, - -- Use the categories from Wappalyzer if the categories are empty - IF(ARRAY_LENGTH(f.categories) = 0, w.categories, f.categories) AS categories, - f.info - )) AS technologies - FROM flattened_technologies f - LEFT JOIN wappalyzer w - ON f.technology = w.technology - -- Only reconstruct technologies existing in Wappalyzer - WHERE f.technology IN (SELECT technology FROM wappalyzer) - GROUP BY - client, - page - ) - - SELECT - client, - page, - technologies - FROM impacted_pages - LEFT JOIN reconstructed_technologies - USING (client, page) -); - --- Update the crawl.pages table with the cleaned and restored technologies -UPDATE crawl.pages -SET technologies = crawl_page_technologies_patch.technologies -FROM crawl_page_technologies_patch -WHERE pages.date = '${issue_month}' AND - pages.client = crawl_page_technologies_patch.client AND - pages.page = crawl_page_technologies_patch.page; -`) From b64479e5dd13231abdeedd455e6d045d50737f6a Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Jan 2025 22:07:31 +0100 Subject: [PATCH 04/13] staging tech assert --- definitions/declarations/httparchive.js | 46 ++++++++++++++++++------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/definitions/declarations/httparchive.js b/definitions/declarations/httparchive.js index 2265d5aa..73c58e1f 100644 --- a/definitions/declarations/httparchive.js +++ b/definitions/declarations/httparchive.js @@ -1,17 +1,39 @@ -const stagingTables = ['pages', 'requests', 'parsed_css'] -for (const table of stagingTables) { +// Tables source: https://github.com/HTTPArchive/crawl/blob/main/crawl.py +['pages', 'requests', 'parsed_css'].forEach(table => declare({ - schema: 'crawl_staging', + schema: 'wappalyzer', name: table }) -} +) -declare({ - schema: 'wappalyzer', - name: 'technologies' -}) +assert('corrupted_technology_values').query(ctx => ` +SELECT + date, + client, + tech, + COUNT(DISTINCT page) AS cnt_pages, + ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages +FROM ${ctx.ref('crawl_staging', 'pages')} +LEFT JOIN pages.technologies AS tech +LEFT JOIN tech.categories AS category +WHERE + date = '${constants.currentMonth}' AND + ( + tech.technology NOT IN (SELECT DISTINCT name FROM wappalyzer.technologies) + OR category NOT IN (SELECT DISTINCT name FROM wappalyzer.categories) + OR ARRAY_LENGTH(tech.categories) = 0 + ) +GROUP BY + date, + client, + tech +ORDER BY cnt DESC; +`); -declare({ - schema: 'wappalyzer', - name: 'categories' -}) +// Tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml +['technologies', 'categories'].forEach(table => + declare({ + schema: 'wappalyzer', + name: table + }) +) From 4eea827fce7231430db48e08ba6dc2bab07474ac Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Jan 2025 22:08:12 +0100 Subject: [PATCH 05/13] simpler category rewrite --- definitions/output/crawl/pages.js | 63 +++++++++++-------------------- 1 file changed, 23 insertions(+), 40 deletions(-) diff --git a/definitions/output/crawl/pages.js b/definitions/output/crawl/pages.js index f7f84c33..db0f9d91 100644 --- a/definitions/output/crawl/pages.js +++ b/definitions/output/crawl/pages.js @@ -74,9 +74,9 @@ WHERE date = '${constants.currentMonth}' AND `).postOps(ctx => ` CREATE TEMP TABLE technologies_cleaned AS ( WITH wappalyzer AS ( - SELECT + SELECT DISTINCT name AS technology, - categories AS valid_categories + categories FROM ${ctx.ref('wappalyzer', 'technologies')} ), pages AS ( SELECT @@ -85,7 +85,7 @@ CREATE TEMP TABLE technologies_cleaned AS ( tech.technology, tech.categories, tech.info - FROM ${ctx.ref('crawl', 'pages')} AS pages + FROM ${ctx.self()} AS pages LEFT JOIN pages.technologies AS tech WHERE date = '${constants.currentMonth}' ${constants.devRankFilter} ), -- Identify impacted pages @@ -94,47 +94,32 @@ CREATE TEMP TABLE technologies_cleaned AS ( client, page FROM pages + LEFT JOIN pages.categories AS category WHERE - -- Categories are empty OR technology is corrupted OR contains invalid categories - (technology IS NOT NULL AND ARRAY_LENGTH(categories) = 0) - OR technology NOT IN (SELECT technology FROM wappalyzer) - OR EXISTS ( - SELECT category - FROM UNNEST(categories) AS category - WHERE category NOT IN (SELECT DISTINCT UNNEST(valid_categories) FROM wappalyzer) + -- Technology is corrupted + technology NOT IN (SELECT DISTINCT technology FROM wappalyzer) OR + -- Technology's category is corrupted + CONCAT(technology, category) NOT IN ( + SELECT DISTINCT + CONCAT(technology, category) + FROM wappalyzer + LEFT JOIN wappalyzer.categories AS category ) - ), -- Flatten technologies for reconstruction - flattened_technologies AS ( - SELECT - client, - page, - technology, - ARRAY( - SELECT category - FROM UNNEST(categories) AS category - WHERE category IN ( - SELECT DISTINCT UNNEST(valid_categories) - FROM wappalyzer - WHERE technology = flattened_technologies.technology - ) - ) AS valid_categories, - info - FROM pages - WHERE page IN (SELECT DISTINCT page FROM impacted_pages) - ), -- Reconstruct valid technologies + ), -- Keep valid technologies and use correct categories reconstructed_technologies AS ( SELECT client, page, ARRAY_AGG(STRUCT( - f.technology, - IF(ARRAY_LENGTH(f.valid_categories) = 0, w.valid_categories, f.valid_categories) AS categories, - f.info + pages.technology, + wappalyzer.categories, + pages.info )) AS technologies - FROM flattened_technologies f - LEFT JOIN wappalyzer w - ON f.technology = w.technology - WHERE f.technology IN (SELECT technology FROM wappalyzer) + FROM pages + INNER JOIN impacted_pages + USING (client, page) + INNER JOIN wappalyzer + ON pages.technology = wappalyzer.technology GROUP BY client, page @@ -143,10 +128,8 @@ CREATE TEMP TABLE technologies_cleaned AS ( SELECT client, page, - r.technologies - FROM impacted_pages - LEFT JOIN reconstructed_technologies r - USING (client, page) + technologies + FROM reconstructed_technologies ); -- Update the crawl.pages table with the cleaned and restored technologies From c1af19a15dc920f1fee837bfcd1f9ca22074f9c6 Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Mon, 20 Jan 2025 21:13:55 +0000 Subject: [PATCH 06/13] typos --- definitions/declarations/httparchive.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/definitions/declarations/httparchive.js b/definitions/declarations/httparchive.js index 73c58e1f..c183096d 100644 --- a/definitions/declarations/httparchive.js +++ b/definitions/declarations/httparchive.js @@ -1,7 +1,7 @@ // Tables source: https://github.com/HTTPArchive/crawl/blob/main/crawl.py ['pages', 'requests', 'parsed_css'].forEach(table => declare({ - schema: 'wappalyzer', + schema: 'crawl_staging', name: table }) ) @@ -27,7 +27,7 @@ GROUP BY date, client, tech -ORDER BY cnt DESC; +ORDER BY cnt DESC `); // Tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml From 233619e81ac362a5af744ebf4f67d58fcd5792fd Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Jan 2025 22:26:50 +0100 Subject: [PATCH 07/13] comments --- definitions/declarations/httparchive.js | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/definitions/declarations/httparchive.js b/definitions/declarations/httparchive.js index c183096d..4690a8cc 100644 --- a/definitions/declarations/httparchive.js +++ b/definitions/declarations/httparchive.js @@ -1,4 +1,4 @@ -// Tables source: https://github.com/HTTPArchive/crawl/blob/main/crawl.py +// Staging tables source: https://github.com/HTTPArchive/crawl/blob/main/crawl.py ['pages', 'requests', 'parsed_css'].forEach(table => declare({ schema: 'crawl_staging', @@ -6,14 +6,16 @@ }) ) -assert('corrupted_technology_values').query(ctx => ` +assert('corrupted_technology_values') + .tags(['crawl_complete']) + .query(ctx => ` SELECT date, client, tech, COUNT(DISTINCT page) AS cnt_pages, ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages -FROM ${ctx.ref('crawl_staging', 'pages')} +FROM ${ctx.ref('crawl_staging', 'pages')} AS pages LEFT JOIN pages.technologies AS tech LEFT JOIN tech.categories AS category WHERE @@ -27,10 +29,10 @@ GROUP BY date, client, tech -ORDER BY cnt DESC +ORDER BY cnt_pages DESC `); -// Tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml +// Wappalyzer tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml ['technologies', 'categories'].forEach(table => declare({ schema: 'wappalyzer', From dd99da192e2ead0e7b94b30d0c5b88ade8f4785d Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Jan 2025 22:32:45 +0100 Subject: [PATCH 08/13] alias --- definitions/output/crawl/pages.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/definitions/output/crawl/pages.js b/definitions/output/crawl/pages.js index db0f9d91..804635c7 100644 --- a/definitions/output/crawl/pages.js +++ b/definitions/output/crawl/pages.js @@ -133,7 +133,7 @@ CREATE TEMP TABLE technologies_cleaned AS ( ); -- Update the crawl.pages table with the cleaned and restored technologies -UPDATE crawl.pages +UPDATE ${ctx.self()} AS pages SET technologies = technologies_cleaned.technologies FROM technologies_cleaned WHERE pages.date = '${constants.currentMonth}' AND From d2aed77faa8c5cf7990a1cd6044871fcbc71486f Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Jan 2025 23:10:13 +0100 Subject: [PATCH 09/13] Update definitions/output/crawl/pages.js Co-authored-by: Barry Pollard --- definitions/output/crawl/pages.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/definitions/output/crawl/pages.js b/definitions/output/crawl/pages.js index 804635c7..bb914f51 100644 --- a/definitions/output/crawl/pages.js +++ b/definitions/output/crawl/pages.js @@ -78,7 +78,9 @@ CREATE TEMP TABLE technologies_cleaned AS ( name AS technology, categories FROM ${ctx.ref('wappalyzer', 'technologies')} - ), pages AS ( + ), + + pages AS ( SELECT client, page, From 7c192443864ec46fcc74f98708004098f7338b0d Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Jan 2025 23:10:22 +0100 Subject: [PATCH 10/13] Update definitions/output/crawl/pages.js Co-authored-by: Barry Pollard --- definitions/output/crawl/pages.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/definitions/output/crawl/pages.js b/definitions/output/crawl/pages.js index bb914f51..a46c9138 100644 --- a/definitions/output/crawl/pages.js +++ b/definitions/output/crawl/pages.js @@ -90,7 +90,9 @@ CREATE TEMP TABLE technologies_cleaned AS ( FROM ${ctx.self()} AS pages LEFT JOIN pages.technologies AS tech WHERE date = '${constants.currentMonth}' ${constants.devRankFilter} - ), -- Identify impacted pages + ), + + -- Identify impacted pages impacted_pages AS ( SELECT DISTINCT client, From b9e26f8c24431c63329c345b06b266d3d434617d Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Jan 2025 23:10:29 +0100 Subject: [PATCH 11/13] Update definitions/output/crawl/pages.js Co-authored-by: Barry Pollard --- definitions/output/crawl/pages.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/definitions/output/crawl/pages.js b/definitions/output/crawl/pages.js index a46c9138..36d75c30 100644 --- a/definitions/output/crawl/pages.js +++ b/definitions/output/crawl/pages.js @@ -109,7 +109,9 @@ CREATE TEMP TABLE technologies_cleaned AS ( FROM wappalyzer LEFT JOIN wappalyzer.categories AS category ) - ), -- Keep valid technologies and use correct categories + ), + + -- Keep valid technologies and use correct categories reconstructed_technologies AS ( SELECT client, From af676c808b7e8b7459e77491fe8a4dbd84a50e1f Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Jan 2025 23:13:59 +0100 Subject: [PATCH 12/13] lint --- definitions/output/crawl/pages.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/definitions/output/crawl/pages.js b/definitions/output/crawl/pages.js index 36d75c30..b39624f2 100644 --- a/definitions/output/crawl/pages.js +++ b/definitions/output/crawl/pages.js @@ -79,7 +79,7 @@ CREATE TEMP TABLE technologies_cleaned AS ( categories FROM ${ctx.ref('wappalyzer', 'technologies')} ), - + pages AS ( SELECT client, From ce77b9a394eb04b1a5bccce994fbbbd9a984ad48 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Jan 2025 23:38:54 +0100 Subject: [PATCH 13/13] Update definitions/declarations/httparchive.js Co-authored-by: Barry Pollard --- definitions/declarations/httparchive.js | 1 + 1 file changed, 1 insertion(+) diff --git a/definitions/declarations/httparchive.js b/definitions/declarations/httparchive.js index 4690a8cc..aa6d183d 100644 --- a/definitions/declarations/httparchive.js +++ b/definitions/declarations/httparchive.js @@ -6,6 +6,7 @@ }) ) +// See https://github.com/HTTPArchive/dataform/issues/43 assert('corrupted_technology_values') .tags(['crawl_complete']) .query(ctx => `