From 8e7e949279102a8e98940e354a86e03389d977de Mon Sep 17 00:00:00 2001
From: Barry Pollard
Date: Thu, 20 Jun 2024 13:38:48 +0100
Subject: [PATCH 1/3] Convert Element Popularity to use new tables

---
NOTE(review): line breaks and indentation in this series were restored from a
whitespace-collapsed copy (two-space SQL indentation assumed). The element type
in `RETURNS ARRAY<STRING>` was missing in that copy and is restored because
BigQuery requires one; the author e-mail address was also missing and is left
absent. Verify against the original commits before applying.

NOTE(review): the `total` subquery below filters on `rank = 1000`, while the
outer query filters on `date` only, so the `pct` denominator is computed over
a subset of the rows that feed the numerator. This looks like a leftover from
testing on a smaller sample - confirm, and drop the predicate if so.

 sql/histograms/htmlElementPopularity.sql | 33 ++++++++++++++++--------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/sql/histograms/htmlElementPopularity.sql b/sql/histograms/htmlElementPopularity.sql
index 60dd99d..3fdebee 100644
--- a/sql/histograms/htmlElementPopularity.sql
+++ b/sql/histograms/htmlElementPopularity.sql
@@ -2,8 +2,7 @@
 CREATE TEMPORARY FUNCTION getElements(payload STRING)
 RETURNS ARRAY<STRING> LANGUAGE js AS '''
 try {
-  var $ = JSON.parse(payload);
-  var elements = JSON.parse($._element_count);
+  var elements = JSON.parse(payload);
   if (Array.isArray(elements) || typeof elements != 'object') return [];
   return Object.keys(elements);
 } catch (e) {
@@ -12,24 +11,36 @@ try {
 ''';
 
 SELECT
-  _TABLE_SUFFIX AS client,
+  client,
   element,
-  COUNT(DISTINCT url) AS pages,
+  COUNT(DISTINCT root_page) AS pages,
   total,
-  COUNT(DISTINCT url) / total AS pct,
-  ARRAY_TO_STRING(ARRAY_AGG(DISTINCT url LIMIT 5), ' ') AS sample_urls
+  COUNT(DISTINCT root_page) / total AS pct,
+  ARRAY_TO_STRING(ARRAY_AGG(DISTINCT page LIMIT 5), ' ') AS sample_urls
 FROM
-  `httparchive.pages.${YYYY_MM_DD}_*`
+  `httparchive.all.pages`
 JOIN
-  (SELECT _TABLE_SUFFIX, COUNT(0) AS total FROM `httparchive.pages.${YYYY_MM_DD}_*` GROUP BY _TABLE_SUFFIX)
-USING (_TABLE_SUFFIX),
-  UNNEST(getElements(payload)) AS element
+  (
+    SELECT
+      date,
+      client,
+      COUNT(DISTINCT root_page) AS total
+    FROM
+      `httparchive.all.pages`
+    WHERE date = PARSE_DATE('%Y_%m_%d', '${YYYY_MM_DD}') AND
+      rank = 1000
+    GROUP BY date, client
+  )
+USING (date, client),
+  UNNEST(getElements(JSON_EXTRACT(custom_metrics, '$.element_count'))) AS element
+WHERE
+    date = PARSE_DATE('%Y_%m_%d', '${YYYY_MM_DD}')
 GROUP BY
   client,
   total,
   element
 HAVING
-  COUNT(DISTINCT url) >= 10
+  COUNT(DISTINCT root_page) >= 10
 ORDER BY
   pages / total DESC,
   client

From 4a75e664f5a097f2ea32e33254c11c8e7e5649f7 Mon Sep 17 00:00:00 2001
From: Barry Pollard
Date: Thu, 20 Jun 2024 13:47:01 +0100
Subject: [PATCH 2/3] Linting

---
 sql/histograms/htmlElementPopularity.sql | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sql/histograms/htmlElementPopularity.sql b/sql/histograms/htmlElementPopularity.sql
index 3fdebee..d4653de 100644
--- a/sql/histograms/htmlElementPopularity.sql
+++ b/sql/histograms/htmlElementPopularity.sql
@@ -29,7 +29,9 @@ JOIN
       `httparchive.all.pages`
     WHERE date = PARSE_DATE('%Y_%m_%d', '${YYYY_MM_DD}') AND
       rank = 1000
-    GROUP BY date, client
+    GROUP BY
+      date,
+      client
   )
 USING (date, client),
   UNNEST(getElements(JSON_EXTRACT(custom_metrics, '$.element_count'))) AS element

From eace463606c26f5e13fd5568407ff0cba0261603 Mon Sep 17 00:00:00 2001
From: Barry Pollard
Date: Thu, 20 Jun 2024 13:48:45 +0100
Subject: [PATCH 3/3] More linting

---
NOTE(review): in the second hunk the removed and added `date = ...` lines
differ only in whitespace, which the collapsed copy did not preserve. The
removed line is assumed to have been over-indented (4 spaces instead of 2);
it may instead have carried trailing whitespace. The matching added line in
patch 1/3 uses the same assumed indentation - confirm against the original
commit.

 sql/histograms/htmlElementPopularity.sql | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sql/histograms/htmlElementPopularity.sql b/sql/histograms/htmlElementPopularity.sql
index d4653de..773dc40 100644
--- a/sql/histograms/htmlElementPopularity.sql
+++ b/sql/histograms/htmlElementPopularity.sql
@@ -27,7 +27,8 @@ JOIN
       COUNT(DISTINCT root_page) AS total
     FROM
       `httparchive.all.pages`
-    WHERE date = PARSE_DATE('%Y_%m_%d', '${YYYY_MM_DD}') AND
+    WHERE
+      date = PARSE_DATE('%Y_%m_%d', '${YYYY_MM_DD}') AND
       rank = 1000
     GROUP BY
       date,
@@ -36,7 +37,7 @@ JOIN
 USING (date, client),
   UNNEST(getElements(JSON_EXTRACT(custom_metrics, '$.element_count'))) AS element
 WHERE
-    date = PARSE_DATE('%Y_%m_%d', '${YYYY_MM_DD}')
+  date = PARSE_DATE('%Y_%m_%d', '${YYYY_MM_DD}')
 GROUP BY
   client,
   total,