From 6aebd3572f1fb1fea4cf7a1eddbfd1f8a14ec20a Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Fri, 1 Aug 2025 16:25:16 +0200 Subject: [PATCH 01/13] dates updated --- sql/2025/privacy/ccpa_most_common_phrases.sql | 31 ++ sql/2025/privacy/ccpa_prevalence.sql | 27 ++ .../privacy/cookies_top_first_party_names.sql | 37 ++ .../cookies_top_third_party_domains.sql | 37 ++ .../privacy/cookies_top_third_party_names.sql | 37 ++ .../privacy/easylist-tracker-detection.sql | 41 ++ .../fingerprinting_most_common_apis.sql | 36 ++ .../fingerprinting_most_common_scripts.sql | 23 + .../privacy/fingerprinting_script_count.sql | 21 + .../privacy/most_common_bounce_domains.sql | 89 ++++ sql/2025/privacy/most_common_client_hints.sql | 52 +++ .../most_common_cmps_for_iab_tcf_v2.sql | 27 ++ .../privacy/most_common_cname_domains.sql | 92 ++++ .../most_common_countries_for_iab_tcf_v2.sql | 43 ++ .../privacy/most_common_referrer_policy.sql | 65 +++ .../most_common_strings_for_iab_usp.sql | 27 ++ .../most_common_tracker_categories.sql | 65 +++ ...stered_by_third_parties_and_publishers.sql | 94 ++++ ...er_of_privacy_sandbox_attested_domains.sql | 44 ++ .../number_of_websites_per_technology.sql | 34 ++ ...er_of_websites_per_technology_category.sql | 22 + ..._of_websites_using_each_fingerprinting.sql | 32 ++ .../number_of_websites_with_client_hints.sql | 44 ++ .../privacy/number_of_websites_with_dnt.sql | 34 ++ .../privacy/number_of_websites_with_gpc.sql | 34 ++ .../privacy/number_of_websites_with_iab.sql | 61 +++ .../number_of_websites_with_nb_trackers.sql | 96 ++++ ...number_of_websites_with_referrerpolicy.sql | 88 ++++ ...of_websites_with_related_origin_trials.sql | 105 +++++ ..._of_websites_with_whotracksme_trackers.sql | 43 ++ ...doption-by-third-parties-by-publishers.sql | 150 +++++++ ...inations_registered_by_most_publishers.sql | 83 ++++ ...tions_registered_by_most_third_parties.sql | 83 ++++ sql/util/bq_to_sheets.ipynb | 418 
+++++++++--------- 34 files changed, 2006 insertions(+), 209 deletions(-) create mode 100644 sql/2025/privacy/ccpa_most_common_phrases.sql create mode 100644 sql/2025/privacy/ccpa_prevalence.sql create mode 100644 sql/2025/privacy/cookies_top_first_party_names.sql create mode 100644 sql/2025/privacy/cookies_top_third_party_domains.sql create mode 100644 sql/2025/privacy/cookies_top_third_party_names.sql create mode 100644 sql/2025/privacy/easylist-tracker-detection.sql create mode 100644 sql/2025/privacy/fingerprinting_most_common_apis.sql create mode 100644 sql/2025/privacy/fingerprinting_most_common_scripts.sql create mode 100644 sql/2025/privacy/fingerprinting_script_count.sql create mode 100644 sql/2025/privacy/most_common_bounce_domains.sql create mode 100644 sql/2025/privacy/most_common_client_hints.sql create mode 100644 sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql create mode 100644 sql/2025/privacy/most_common_cname_domains.sql create mode 100644 sql/2025/privacy/most_common_countries_for_iab_tcf_v2.sql create mode 100644 sql/2025/privacy/most_common_referrer_policy.sql create mode 100644 sql/2025/privacy/most_common_strings_for_iab_usp.sql create mode 100644 sql/2025/privacy/most_common_tracker_categories.sql create mode 100644 sql/2025/privacy/number_of_ara_destinations_registered_by_third_parties_and_publishers.sql create mode 100644 sql/2025/privacy/number_of_privacy_sandbox_attested_domains.sql create mode 100644 sql/2025/privacy/number_of_websites_per_technology.sql create mode 100644 sql/2025/privacy/number_of_websites_per_technology_category.sql create mode 100644 sql/2025/privacy/number_of_websites_using_each_fingerprinting.sql create mode 100644 sql/2025/privacy/number_of_websites_with_client_hints.sql create mode 100644 sql/2025/privacy/number_of_websites_with_dnt.sql create mode 100644 sql/2025/privacy/number_of_websites_with_gpc.sql create mode 100644 sql/2025/privacy/number_of_websites_with_iab.sql create mode 100644 
-- sql/2025/privacy/ccpa_most_common_phrases.sql
-- Most common CCPA-link phrases by number of root pages, bucketed by rank group.
-- Rank buckets are cumulative (a rank-900 page appears in every bucket >= 1000),
-- so a page contributes once per bucket it belongs to.
WITH pages_with_phrase AS (
  SELECT
    client,
    rank_grouping,
    page,
    -- Denominator: pages in this client/rank bucket that expose at least one phrase.
    COUNT(DISTINCT page) OVER (PARTITION BY client, rank_grouping) AS total_pages_with_phrase_in_rank_group,
    JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases') AS ccpa_link_phrases
  FROM `httparchive.crawl.pages`, -- TABLESAMPLE SYSTEM (0.01 PERCENT) (re-enable while developing to cut cost)
    UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE AND
    rank <= rank_grouping AND
    -- Only keep pages where the custom metric detected at least one CCPA link phrase.
    ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases')) > 0
)

SELECT
  client,
  rank_grouping,
  link_phrase,
  COUNT(DISTINCT page) AS num_pages,
  COUNT(DISTINCT page) / ANY_VALUE(total_pages_with_phrase_in_rank_group) AS pct_pages
FROM pages_with_phrase,
  UNNEST(ccpa_link_phrases) AS link_phrase
GROUP BY
  link_phrase,
  rank_grouping,
  client
ORDER BY
  rank_grouping,
  client,
  num_pages DESC
-- sql/2025/privacy/ccpa_prevalence.sql
-- Share of root pages that expose a CCPA ("Do Not Sell") link, by rank bucket.
WITH pages AS (
  SELECT
    client,
    rank_grouping,
    page,
    JSON_VALUE(custom_metrics, '$.privacy.ccpa_link.hasCCPALink') AS has_ccpa_link
  FROM `httparchive.crawl.pages`, -- TABLESAMPLE SYSTEM (0.0025 PERCENT) (re-enable while developing to cut cost)
    UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE AND
    rank <= rank_grouping
)

SELECT
  client,
  rank_grouping,
  has_ccpa_link,
  COUNT(DISTINCT page) AS num_pages
FROM pages
GROUP BY
  has_ccpa_link,
  rank_grouping,
  client
ORDER BY
  rank_grouping,
  client,
  has_ccpa_link;

-- sql/2025/privacy/cookies_top_first_party_names.sql
-- Most common cookie names, by number of domains on which they appear.
-- Goal is to identify common trackers that use first-party cookies across sites.
WITH pages AS (
  SELECT
    client,
    root_page,
    custom_metrics,
    COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
),

cookies AS (
  SELECT
    client,
    cookie,
    -- A cookie without a "domain" attribute yields NULL here and is dropped by
    -- the first-party predicate below (NULL comparisons are never TRUE).
    NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host,
    NET.HOST(root_page) AS firstparty_host,
    total_domains
  FROM pages,
    UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie
)

SELECT
  client,
  COUNT(DISTINCT firstparty_host) AS domain_count,
  COUNT(DISTINCT firstparty_host) / ANY_VALUE(total_domains) AS pct_domains,
  JSON_VALUE(cookie, '$.name') AS cookie_name
FROM cookies
-- First-party: the cookie domain equals the page host or is a parent domain of it.
-- The previous `LIKE '%' || cookie_host` had no label boundary, so e.g. page host
-- "notexample.com" wrongly matched cookie domain "example.com".
WHERE firstparty_host = cookie_host OR ENDS_WITH(firstparty_host, '.' || cookie_host)
GROUP BY
  client,
  cookie_name
ORDER BY
  domain_count DESC,
  client DESC
LIMIT 500
-- sql/2025/privacy/cookies_top_third_party_domains.sql
-- Most common third-party cookie domains, by number of pages on which they appear.
WITH pages AS (
  SELECT
    page,
    client,
    root_page,
    custom_metrics,
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
),

cookies AS (
  SELECT
    client,
    page,
    cookie,
    -- A cookie without a "domain" attribute yields NULL here and is dropped by
    -- the third-party predicate below (NULL comparisons are never TRUE).
    NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host,
    NET.HOST(root_page) AS firstparty_host,
    total_pages
  FROM pages,
    UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie
)

SELECT
  client,
  cookie_host,
  COUNT(DISTINCT page) AS page_count,
  COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages
FROM cookies
-- Third-party: the cookie domain is neither the page host nor a parent domain of it.
-- The previous `NOT LIKE '%' || cookie_host` had no label boundary, so e.g. page
-- host "notexample.com" was wrongly treated as first-party for "example.com".
WHERE NOT (firstparty_host = cookie_host OR ENDS_WITH(firstparty_host, '.' || cookie_host))
GROUP BY
  client,
  cookie_host
ORDER BY
  page_count DESC,
  client
LIMIT 500
-- sql/2025/privacy/cookies_top_third_party_names.sql
-- Most common cookie names, by number of domains on which they appear.
-- Goal is to identify common trackers that set cookies using many domains.
WITH pages AS (
  SELECT
    client,
    root_page,
    custom_metrics,
    COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
),

cookies AS (
  SELECT
    client,
    cookie,
    -- A cookie without a "domain" attribute yields NULL here and is dropped by
    -- the third-party predicate below (NULL comparisons are never TRUE).
    NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host,
    NET.HOST(root_page) AS firstparty_host,
    total_domains
  FROM pages,
    UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie
)

SELECT
  client,
  COUNT(DISTINCT firstparty_host) AS domain_count,
  COUNT(DISTINCT firstparty_host) / ANY_VALUE(total_domains) AS pct_domains,
  JSON_VALUE(cookie, '$.name') AS cookie_name
FROM cookies
-- Third-party: the cookie domain is neither the page host nor a parent domain of it.
-- The previous `NOT LIKE '%' || cookie_host` had no label boundary, so e.g. page
-- host "notexample.com" was wrongly treated as first-party for "example.com".
WHERE NOT (firstparty_host = cookie_host OR ENDS_WITH(firstparty_host, '.' || cookie_host))
GROUP BY
  client,
  cookie_name
ORDER BY
  domain_count DESC,
  client DESC
LIMIT 500;

-- sql/2025/privacy/easylist-tracker-detection.sql
-- Count of distinct request URLs that would be blocked by the EasyList ad-server list.
-- The list table is populated from `easylist_adservers.csv`; see
-- https://github.com/easylist/easylist/blob/master/easylist/easylist_adservers.txt
WITH easylist_data AS (
  SELECT string_field_0
  FROM `httparchive.almanac.easylist_adservers`
),

requests_data AS (
  SELECT url
  FROM `httparchive.all.requests`
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE
)

-- A URL counts as "blocked" when any EasyList domain occurs as a substring of it.
-- STRPOS replaces the original JavaScript UDF (url.includes(domain)) with identical
-- substring semantics but without per-row JS evaluation. The former
-- LEFT JOIN + MAX(CASE ...) + `should_block = 1` pipeline reduces to exactly this:
-- count distinct URLs that match at least one list entry.
SELECT
  COUNT(DISTINCT r.url) AS blocked_url_count
FROM requests_data AS r
INNER JOIN easylist_data AS e
  ON STRPOS(r.url, e.string_field_0) > 0;

-- sql/2025/privacy/fingerprinting_most_common_apis.sql
-- Pages using each category of likely-fingerprinting API, among pages where the
-- custom metric recorded at least one category (UNNEST drops empty arrays, so the
-- window denominator covers only pages with some fingerprinting signal).
CREATE TEMP FUNCTION getFingerprintingTypes(input STRING)
RETURNS ARRAY<STRING> -- `RETURNS ARRAY` without an element type is not valid BigQuery
LANGUAGE js AS """
if (input) {
  try {
    return Object.keys(JSON.parse(input))
  } catch (e) {
    return []
  }
} else {
  return []
}
""";

WITH pages AS (
  SELECT
    client,
    page,
    fingerprinting_type,
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
  FROM `httparchive.crawl.pages`,
    UNNEST(getFingerprintingTypes(JSON_EXTRACT(custom_metrics, '$.privacy.fingerprinting.counts'))) AS fingerprinting_type
  WHERE date = '2025-07-01'
)

SELECT
  client,
  fingerprinting_type,
  COUNT(DISTINCT page) AS page_count,
  COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages
FROM pages
GROUP BY
  client,
  fingerprinting_type
ORDER BY
  page_count DESC
-- sql/2025/privacy/fingerprinting_most_common_scripts.sql
-- Most common likely-fingerprinting script URLs, by number of pages.
WITH pages AS (
  SELECT
    page,
    client,
    custom_metrics,
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
)

SELECT
  client,
  script,
  COUNT(DISTINCT page) AS page_count,
  COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages
FROM pages,
  UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script
GROUP BY
  client,
  script
ORDER BY
  page_count DESC
LIMIT 100;

-- sql/2025/privacy/fingerprinting_script_count.sql
-- Distribution of the number of likely-fingerprinting scripts per page.
WITH pages AS (
  SELECT
    page,
    client,
    ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script_count,
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
)

SELECT
  script_count,
  client,
  COUNT(DISTINCT page) AS page_count,
  COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages
FROM pages
GROUP BY
  script_count,
  client
ORDER BY
  script_count ASC;

-- sql/2025/privacy/most_common_bounce_domains.sql
-- Detection logic explained:
-- https://github.com/privacycg/proposals/issues/6
-- https://github.com/privacycg/nav-tracking-mitigations/blob/main/bounce-tracking-explainer.md
WITH redirect_requests AS (
  SELECT
    client,
    url,
    index,
    response_headers,
    page
  FROM `httparchive.crawl.requests`
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE AND
    type NOT IN ('css', 'image', 'font', 'video', 'audio') AND
    -- 3xx responses only. Integer division replaces ROUND(status / 100), which
    -- misclassified statuses 250-299 as redirects (2.5+ rounds up to 3).
    DIV(INT64(summary.status), 100) = 3 AND
    index <= 2
),

navigation_redirect AS (
  -- First navigation redirect, pointing off the page's registrable domain.
  SELECT
    client,
    url,
    page,
    response_header.value AS navigation_redirect_location
  FROM redirect_requests,
    UNNEST(response_headers) AS response_header
  WHERE
    index = 1 AND
    LOWER(response_header.name) = 'location' AND
    NET.REG_DOMAIN(response_header.value) != NET.REG_DOMAIN(page)
),

bounce_redirect AS (
  -- Second navigation redirect (the bounce back out of the intermediate host).
  SELECT
    client,
    url,
    page,
    response_header.value AS bounce_redirect_location
  FROM redirect_requests,
    UNNEST(response_headers) AS response_header
  WHERE
    index = 2 AND
    LOWER(response_header.name) = 'location'
),

bounce_sequences AS (
  -- Chain the two redirects; the WHERE on the right side's column makes this an
  -- effective INNER JOIN (pages whose first redirect is not itself redirected drop out).
  SELECT
    nav.client,
    NET.REG_DOMAIN(navigation_redirect_location) AS bounce_hostname,
    COUNT(DISTINCT nav.page) AS number_of_pages
  FROM navigation_redirect AS nav
  LEFT JOIN bounce_redirect AS bounce
    ON
      nav.client = bounce.client AND
      nav.page = bounce.page AND
      nav.navigation_redirect_location = bounce.url
  WHERE bounce_redirect_location IS NOT NULL
  GROUP BY
    nav.client,
    bounce_hostname
),

pages_total AS (
  SELECT
    client,
    COUNT(DISTINCT page) AS total_pages
  FROM `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    is_root_page
  GROUP BY client
)

-- Number of websites with bounce tracking per bounce hostname.
SELECT
  client,
  bounce_hostname,
  number_of_pages,
  number_of_pages / total_pages AS pct_pages
FROM bounce_sequences
JOIN pages_total
USING (client)
ORDER BY number_of_pages DESC
LIMIT 100
-- sql/2025/privacy/most_common_client_hints.sql
-- Pages that request Client Hints, via the Accept-CH response header or an
-- equivalent <meta http-equiv="accept-ch"> tag.
WITH response_headers AS (
  SELECT
    client,
    page,
    LOWER(response_header.name) AS header_name,
    LOWER(response_header.value) AS header_value,
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_websites
  FROM `httparchive.all.requests`,
    UNNEST(response_headers) response_header
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE AND
    is_main_document = TRUE
),

meta_tags AS (
  SELECT
    client,
    page,
    LOWER(JSON_VALUE(meta_node, '$.http-equiv')) AS tag_name,
    LOWER(JSON_VALUE(meta_node, '$.content')) AS tag_value
  FROM (
    SELECT
      client,
      page,
      JSON_QUERY(custom_metrics, '$.almanac') AS metrics
    FROM `httparchive.crawl.pages`
    WHERE
      date = '2025-07-01' AND
      is_root_page = TRUE
  ),
    UNNEST(JSON_QUERY_ARRAY(metrics, '$.meta-nodes.nodes')) meta_node
  WHERE JSON_VALUE(meta_node, '$.http-equiv') IS NOT NULL
)

SELECT
  client,
  -- Prefer the header value; fall back to the meta tag content.
  IF(header_name = 'accept-ch', header_value, tag_value) AS value,
  COUNT(DISTINCT page) / ANY_VALUE(total_websites) AS pct_pages,
  COUNT(DISTINCT page) AS number_of_pages
FROM response_headers
FULL OUTER JOIN meta_tags
USING (client, page)
WHERE
  header_name = 'accept-ch' OR
  tag_name = 'accept-ch'
GROUP BY
  client,
  value
ORDER BY pct_pages DESC
LIMIT 200;

-- sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql
-- Counts of CMPs using IAB Transparency & Consent Framework
-- cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md#tcdata
-- CMP vendor list: https://iabeurope.eu/cmp-list/
-- NOTE: cmpId is NULL for pages with no TCF CMP; the NULL row gives the share of
-- pages without one.
WITH cmps AS (
  SELECT
    client,
    page,
    JSON_VALUE(custom_metrics, '$.privacy.iab_tcf_v2.data.cmpId') AS cmpId,
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
  FROM `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE
)

SELECT
  client,
  cmpId,
  -- One row per page per client here, so COUNT(*) equals the page count.
  COUNT(*) / ANY_VALUE(total_pages) AS pct_pages,
  COUNT(*) AS number_of_pages
FROM cmps
GROUP BY
  client,
  cmpId
ORDER BY
  pct_pages DESC;

-- sql/2025/privacy/most_common_cname_domains.sql
-- Most common CNAME domains (CNAME cloaking: first-party-looking hostnames that
-- resolve to third-party tracker domains).
CREATE TEMP FUNCTION convert_cname_json(json_str STRING)
-- The element type was garbled in transit ("ARRAY>"); the JS below returns
-- objects of the form {origin, cname}, hence this STRUCT type.
RETURNS ARRAY<STRUCT<origin STRING, cname STRING>>
LANGUAGE js AS """
try {
  const obj = JSON.parse(json_str);
  const result = [];
  for (const key in obj) {
    result.push({
      origin: key,
      cname: obj[key]
    });
  }
  return result;
} catch (e) {
  return [];
}
""";

-- Adguard CNAME Trackers source:
-- https://github.com/AdguardTeam/cname-trackers/blob/master/script/src/cloaked-trackers.json
WITH adguard_trackers AS (
  SELECT domain
  FROM UNNEST(['cz.affilbox.cz', 'pl02.prolitteris.2cnt.net', 'a8.net', 'mm.actionlink.jp', 'mr-in.com', 'ebis.ne.jp', '0i0i0i0.com', 'ads.bid', 'at-o.net', 'actonservice.com', 'actonsoftware.com', '2o7.net', 'data.adobedc.net', 'sc.adobedc.net', 'sc.omtrdc.net', 'adocean.pl', 'aquaplatform.com', 'cdn18685953.ahacdn.me', 'thirdparty.bnc.lt', 'api.clickaine.com', 'tagcommander.com', 'track.sp.crdl.io', 'dnsdelegation.io', 'storetail.io', 'e.customeriomail.com', 'dataunlocker.com', 'monopoly-drain.ga', 'friendly-community.tk', 'nc0.co', 'customer.etracker.com', 'eulerian.net', 'extole.com', 'extole.io', 'fathomdns.com', 'genieespv.jp', 'ad-cloud.jp', 'goatcounter.com', 'heleric.com', 'iocnt.net', 'affex.org', 'k.keyade.com', 'ghochv3eng.trafficmanager.net', 'online-metrix.net', 'logly.co.jp', 'mailgun.org', 'ab1n.net', 'ntv.io', 'ntvpforever.com', 'postrelease.com', 'non.li', 'tracking.bp01.net', 't.eloqua.com', 'oghub.io', 'go.pardot.com', 'parsely.com', 'custom.plausible.io', 'popcashjs.b-cdn.net', 'rdtk.io', 'sailthru.com', 'exacttarget.com', 'a351fec2c318c11ea9b9b0a0ae18fb0b-1529426863.eu-central-1.elb.amazonaws.com', 'a5e652663674a11e997c60ac8a4ec150-1684524385.eu-central-1.elb.amazonaws.com', 'a88045584548111e997c60ac8a4ec150-1610510072.eu-central-1.elb.amazonaws.com', 'afc4d9aa2a91d11e997c60ac8a4ec150-2082092489.eu-central-1.elb.amazonaws.com', 'e.truedata.co', 'utiq-aws.net', 'webtrekk.net', 'wt-eu02.net', 'ak-is2.net', 'wizaly.com']) AS domain
),

whotracksme AS (
  SELECT DISTINCT
    domain,
    category
  FROM `httparchive.almanac.whotracksme`
  WHERE date = '2025-07-01'
),

cnames AS (
  -- One row per (client, cname, page): a same-site origin whose CNAME points
  -- off the page's registrable domain. GROUP BY deduplicates multiple origins
  -- on the same page resolving to the same cname.
  SELECT
    client,
    cnames.cname,
    page
  FROM `httparchive.crawl.pages`,
    UNNEST(convert_cname_json(JSON_QUERY(custom_metrics, '$.privacy.request_hostnames_with_cname'))) AS cnames
  WHERE
    date = '2025-07-01' AND
    NET.REG_DOMAIN(cnames.origin) = NET.REG_DOMAIN(page) AND
    NET.REG_DOMAIN(cnames.cname) != NET.REG_DOMAIN(page)
  GROUP BY
    client,
    cnames.cname,
    page
),

pages_total AS (
  SELECT
    client,
    COUNT(DISTINCT page) AS total_pages
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
  GROUP BY client
),

cname_stats AS (
  SELECT
    client,
    NET.REG_DOMAIN(cname) AS cname,
    adguard_trackers.domain IS NOT NULL AS adguard_known_cname,
    whotracksme.category AS whotracksme_category,
    COUNT(DISTINCT page) AS number_of_pages
  FROM cnames
  LEFT JOIN adguard_trackers
    ON ENDS_WITH(cnames.cname, adguard_trackers.domain)
  LEFT JOIN whotracksme
    ON ENDS_WITH(cnames.cname, whotracksme.domain)
  GROUP BY
    client,
    cname,
    adguard_known_cname,
    whotracksme_category
)

SELECT
  client,
  cname,
  adguard_known_cname,
  whotracksme_category,
  number_of_pages,
  number_of_pages / total_pages AS pct_pages
FROM cname_stats
LEFT JOIN pages_total
USING (client)
ORDER BY number_of_pages DESC
-- sql/2025/privacy/most_common_countries_for_iab_tcf_v2.sql
-- Counts of countries for publishers using IAB Transparency & Consent Framework
-- cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md#tcdata
-- "Country code of the country that determines the legislation of
-- reference. Normally corresponds to the country code of the country
-- in which the publisher's business entity is established."
WITH totals AS (
  SELECT
    client,
    COUNT(DISTINCT root_page) AS total_websites
  FROM `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    JSON_TYPE(custom_metrics.privacy.iab_tcf_v2.data) = 'object'
  GROUP BY client
),

cmps AS (
  SELECT
    client,
    STRING(custom_metrics.privacy.iab_tcf_v2.data.publisherCC) AS publisherCC,
    COUNT(DISTINCT root_page) AS number_of_pages
  FROM `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    JSON_TYPE(custom_metrics.privacy.iab_tcf_v2.data) = 'object'
  GROUP BY
    client,
    publisherCC
)

SELECT
  client,
  publisherCC,
  number_of_pages / total_websites AS pct_of_pages
FROM cmps
JOIN totals
USING (client)
ORDER BY
  client,
  number_of_pages DESC;

-- sql/2025/privacy/most_common_referrer_policy.sql
-- Most common values for Referrer-Policy (at site level), combining the
-- Referrer-Policy response header and the document-level meta policy.
WITH totals AS (
  SELECT
    client,
    COUNT(DISTINCT page) AS total_pages
  FROM `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE
  GROUP BY client
),

referrer_policy_custom_metrics AS (
  SELECT
    client,
    page,
    LOWER(TRIM(policy_meta)) AS policy_meta
  FROM `httparchive.crawl.pages`,
    UNNEST(SPLIT(JSON_VALUE(custom_metrics, '$.privacy.referrerPolicy.entire_document_policy'), ',')) AS policy_meta
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE
),

response_headers AS (
  SELECT
    client,
    page,
    LOWER(response_header.name) AS name,
    LOWER(response_header.value) AS value
  FROM `httparchive.all.requests`,
    UNNEST(response_headers) AS response_header
  WHERE
    date = '2025-07-01' AND
    -- Root pages only: the other CTEs (including the `totals` denominator)
    -- are restricted to root pages, so this filter was required here too.
    is_root_page = TRUE AND
    is_main_document = TRUE
),

referrer_policy_headers AS (
  SELECT
    client,
    page,
    TRIM(policy_header) AS policy_header
  FROM response_headers,
    UNNEST(SPLIT(value, ',')) AS policy_header
  WHERE
    name = 'referrer-policy'
)

SELECT
  client,
  COALESCE(policy_header, policy_meta) AS policy,
  COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages,
  COUNT(DISTINCT page) AS number_of_pages
FROM referrer_policy_custom_metrics
FULL OUTER JOIN referrer_policy_headers
USING (client, page)
JOIN totals
USING (client)
GROUP BY
  client,
  policy
ORDER BY
  pct_pages DESC
LIMIT 100;

-- sql/2025/privacy/most_common_strings_for_iab_usp.sql
-- Counts of US Privacy String values for websites using IAB US Privacy Framework
-- cf. https://github.com/InteractiveAdvertisingBureau/USPrivacy/blob/master/CCPA/US%20Privacy%20String.md
WITH usp_data AS (
  SELECT
    client,
    page,
    JSON_VALUE(custom_metrics, '$.privacy.iab_usp.privacy_string.uspString') AS uspString,
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS pages_total
  FROM `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE
)

SELECT
  client,
  uspString,
  COUNT(DISTINCT page) / ANY_VALUE(pages_total) AS pct_pages,
  COUNT(DISTINCT page) AS number_of_pages
FROM usp_data
GROUP BY
  client,
  uspString
ORDER BY
  pct_pages DESC
LIMIT 100;

-- sql/2025/privacy/most_common_tracker_categories.sql
-- Percent of pages that deploy at least one tracker from each tracker category
-- (WhoTracks.Me classification); an extra 'any' row counts pages with any tracker.
WITH whotracksme AS (
  SELECT
    domain,
    category,
    tracker
  FROM `httparchive.almanac.whotracksme`
  WHERE date = '2025-07-01'
),

totals AS (
  -- Denominator: pages with at least one request in the crawl.
  SELECT
    client,
    COUNT(DISTINCT page) AS total_websites
  FROM `httparchive.crawl.requests`
  WHERE date = '2025-07-01'
  GROUP BY client
),

tracker_categories AS (
  SELECT
    client,
    category,
    page
  FROM `httparchive.crawl.requests`
  JOIN whotracksme
    ON (
      NET.HOST(url) = domain OR
      ENDS_WITH(NET.HOST(url), CONCAT('.', domain))
    )
  WHERE
    date = '2025-07-01' AND
    NET.REG_DOMAIN(page) != NET.REG_DOMAIN(url) -- third party
),

aggregated AS (
  SELECT
    client,
    category,
    COUNT(DISTINCT page) AS number_of_websites
  FROM tracker_categories
  GROUP BY
    client,
    category
  UNION ALL
  SELECT
    client,
    'any' AS category,
    COUNT(DISTINCT page) AS number_of_websites
  FROM tracker_categories
  GROUP BY
    client
)

SELECT
  client,
  category,
  number_of_websites,
  total_websites,
  number_of_websites / total_websites AS pct_websites
FROM aggregated
JOIN totals
USING (client)
ORDER BY number_of_websites DESC;

-- sql/2025/privacy/number_of_ara_destinations_registered_by_third_parties_and_publishers.sql
#standardSQL
-- Number of Attribution Reporting API destinations (i.e., advertisers) registered,
-- registering third-parties, and registering publishers (at site level).

-- Third parties observed using the ARA API on a publisher page.
CREATE TEMP FUNCTION jsonObjectKeys(input STRING)
RETURNS ARRAY<STRING> -- `RETURNS ARRAY` without an element type is not valid BigQuery
LANGUAGE js AS """
  if (!input) {
    return [];
  }
  return Object.keys(JSON.parse(input));
""";

-- ARA source-registration details passed by a given third party (passed as "key").
CREATE TEMP FUNCTION jsonObjectValues(input STRING, key STRING)
RETURNS ARRAY<STRING> -- `RETURNS ARRAY` without an element type is not valid BigQuery
LANGUAGE js AS """
  if (!input) {
    return [];
  }
  const jsonObject = JSON.parse(input);
  const values = jsonObject[key] || [];
  const result = [];

  values.forEach(value => {
    if (value.toLowerCase().startsWith('attribution-reporting-register-source|')) {
      const parts = value.replace('attribution-reporting-register-source|', '').split('|');
      parts.forEach(part => {
        if (part.startsWith('destination=')) {
          const destinations = part.replace('destination=', '').split(',');
          destinations.forEach(destination => {
            result.push('destination=' + destination.trim());
          });
        } else {
          result.push(part.trim());
        }
      });
    }
  });

  return result;
""";

WITH ara_features AS (
  SELECT
    client,
    CASE
      WHEN rank <= 1000 THEN '1000'
      WHEN rank <= 10000 THEN '10000'
      WHEN rank <= 100000 THEN '100000'
      WHEN rank <= 1000000 THEN '1000000'
      WHEN rank <= 10000000 THEN '10000000'
      ELSE 'Other'
    END AS rank_group,
    NET.REG_DOMAIN(page) AS publisher,
    CASE
      WHEN ara LIKE 'destination=%' THEN NET.REG_DOMAIN(REPLACE(ara, 'destination=', ''))
      ELSE NULL
    END AS destination,
    third_party_domain
  FROM `httparchive.crawl.pages`,
    UNNEST(jsonObjectKeys(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'))) AS third_party_domain,
    UNNEST(jsonObjectValues(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'), third_party_domain)) AS ara
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE AND
    ara LIKE 'destination%'
)

SELECT
  client,
  rank_group,
  COUNT(destination) AS total_destinations,
  COUNT(DISTINCT destination) AS distinct_destinations,
  ROUND(COUNT(DISTINCT destination) * 100 / COUNT(destination), 2) AS destination_pct,
  COUNT(third_party_domain) AS total_third_party_domains,
  COUNT(DISTINCT third_party_domain) AS distinct_third_party_domains,
  ROUND(COUNT(DISTINCT third_party_domain) * 100 / COUNT(third_party_domain), 2) AS third_party_domain_pct,
  COUNT(publisher) AS total_publishers,
  COUNT(DISTINCT publisher) AS distinct_publishers,
  ROUND(COUNT(DISTINCT publisher) * 100 / COUNT(publisher), 2) AS publisher_pct
FROM ara_features
WHERE destination IS NOT NULL AND third_party_domain IS NOT NULL
GROUP BY client, rank_group
ORDER BY
  client,
  -- Numeric ordering of the textual rank buckets.
  CASE rank_group
    WHEN '1000' THEN 1
    WHEN '10000' THEN 2
    WHEN '100000' THEN 3
    WHEN '1000000' THEN 4
    WHEN '10000000' THEN 5
    ELSE 6
  END;

-- sql/2025/privacy/number_of_privacy_sandbox_attested_domains.sql
#standardSQL
-- Privacy Sandbox attestation and Related Website Sets JSON status per rank group,
-- based on the crawled /.well-known/ endpoints.
WITH wellknown AS (
  SELECT
    client,
    NET.HOST(page) AS host,
    CASE
      WHEN rank <= 1000 THEN '1000'
      WHEN rank <= 10000 THEN '10000'
      WHEN rank <= 100000 THEN '100000'
      WHEN rank <= 1000000 THEN '1000000'
      WHEN rank <= 10000000 THEN '10000000'
      ELSE 'Other'
    END AS rank_group,
    CAST(JSON_VALUE(custom_metrics, '$.well-known."/.well-known/related-website-set.json".found') AS BOOL) AS rws,
    CAST(JSON_VALUE(custom_metrics, '$.well-known."/.well-known/privacy-sandbox-attestations.json".found') AS BOOL) AS attestation
  FROM `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE
)

SELECT
  client,
  rank_group,
  -- COUNTIF counts rows where the flag is TRUE (NULL/FALSE ignored), matching
  -- the former SUM(CASE WHEN ... THEN 1 ELSE 0 END).
  COUNTIF(rws) AS related_websites_set,
  COUNTIF(attestation) AS privacy_sandbox_attestation
FROM wellknown
WHERE
  rws OR attestation
GROUP BY client, rank_group
ORDER BY
  client,
  CASE rank_group
    WHEN '1000' THEN 1
    WHEN '10000' THEN 2
    WHEN '100000' THEN 3
    WHEN '1000000' THEN 4
    WHEN '10000000' THEN 5
    ELSE 6
  END;
b/sql/2025/privacy/number_of_websites_per_technology.sql @@ -0,0 +1,34 @@ +WITH technologies AS ( + SELECT + client, + page, + category, + technology, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_websites + FROM `httparchive.crawl.pages`, + UNNEST(technologies) AS tech, + UNNEST(categories) AS category + WHERE + date = '2025-07-01' AND + is_root_page = TRUE +) + +SELECT + client, + technology, + COUNT(DISTINCT page) / ANY_VALUE(total_websites) AS pct_pages, + COUNT(DISTINCT page) AS number_of_pages, + ARRAY_AGG(DISTINCT category) AS categories +FROM technologies +WHERE + category IN ( + 'Analytics', 'Browser fingerprinting', 'Customer data platform', + 'Geolocation', + 'Advertising', 'Retargeting', 'Personalisation', 'Segmentation', + 'Cookie compliance' + ) +GROUP BY + client, + technology +ORDER BY + pct_pages DESC diff --git a/sql/2025/privacy/number_of_websites_per_technology_category.sql b/sql/2025/privacy/number_of_websites_per_technology_category.sql new file mode 100644 index 00000000000..ac40b734f59 --- /dev/null +++ b/sql/2025/privacy/number_of_websites_per_technology_category.sql @@ -0,0 +1,22 @@ +SELECT + client, + category, + COUNT(DISTINCT IF(category = tech_category, page, NULL)) / COUNT(DISTINCT page) AS pct_pages, + COUNT(DISTINCT IF(category = tech_category, page, NULL)) AS number_of_pages +FROM `httparchive.crawl.pages`, + UNNEST(technologies) AS tech, + UNNEST(categories) AS tech_category, + UNNEST([ + 'Analytics', 'Browser fingerprinting', 'Customer data platform', + 'Geolocation', + 'Advertising', 'Retargeting', 'Personalisation', 'Segmentation', + 'Cookie compliance' + ]) AS category +WHERE + date = '2025-07-01' AND + is_root_page = TRUE +GROUP BY + client, + category +ORDER BY + pct_pages DESC diff --git a/sql/2025/privacy/number_of_websites_using_each_fingerprinting.sql b/sql/2025/privacy/number_of_websites_using_each_fingerprinting.sql new file mode 100644 index 00000000000..67068c3fca4 --- /dev/null +++ 
b/sql/2025/privacy/number_of_websites_using_each_fingerprinting.sql @@ -0,0 +1,32 @@ +# Percent of websites using a fingerprinting library based on wappalyzer category +WITH totals AS ( + SELECT + client, + COUNT(DISTINCT page) AS total_websites + FROM httparchive.crawl.pages + WHERE + date = '2025-07-01' + GROUP BY + client +) + +SELECT + client, + technology.technology, + total_websites, + COUNT(DISTINCT page) AS number_of_websites, + COUNT(DISTINCT page) / total_websites AS percent_of_websites +FROM httparchive.crawl.pages +JOIN totals USING (client), + UNNEST(technologies) AS technology, + UNNEST(technology.categories) AS category +WHERE + date = '2025-07-01' AND + category = 'Browser fingerprinting' +GROUP BY + client, + total_websites, + technology +ORDER BY + client, + number_of_websites DESC diff --git a/sql/2025/privacy/number_of_websites_with_client_hints.sql b/sql/2025/privacy/number_of_websites_with_client_hints.sql new file mode 100644 index 00000000000..7953b46f929 --- /dev/null +++ b/sql/2025/privacy/number_of_websites_with_client_hints.sql @@ -0,0 +1,44 @@ +WITH response_headers AS ( + SELECT + client, + page, + LOWER(response_header.name) AS header_name, + LOWER(response_header.value) AS header_value, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_websites + FROM `httparchive.all.requests`, + UNNEST(response_headers) response_header + WHERE + date = '2025-07-01' AND + is_main_document = TRUE +), + +meta_tags AS ( + SELECT + client, + page, + LOWER(JSON_VALUE(meta_node, '$.http-equiv')) AS tag_name, + LOWER(JSON_VALUE(meta_node, '$.content')) AS tag_value + FROM ( + SELECT + client, + page, + JSON_VALUE(custom_metrics, '$.almanac') AS metrics + FROM `httparchive.crawl.pages` + WHERE date = '2025-07-01' + ), + UNNEST(JSON_QUERY_ARRAY(metrics, '$.meta-nodes.nodes')) meta_node + WHERE JSON_VALUE(meta_node, '$.http-equiv') IS NOT NULL +) + +SELECT + client, + COUNT(DISTINCT page) / ANY_VALUE(total_websites) AS pct_pages, + COUNT(DISTINCT 
page) AS number_of_pages +FROM response_headers +FULL OUTER JOIN meta_tags +USING (client, page) +WHERE + header_name = 'accept-ch' OR + tag_name = 'accept-ch' +GROUP BY client +ORDER BY pct_pages DESC diff --git a/sql/2025/privacy/number_of_websites_with_dnt.sql b/sql/2025/privacy/number_of_websites_with_dnt.sql new file mode 100644 index 00000000000..8cace44174e --- /dev/null +++ b/sql/2025/privacy/number_of_websites_with_dnt.sql @@ -0,0 +1,34 @@ +# Pages that request DNT status + +WITH blink AS ( + SELECT DISTINCT + client, + num_urls, + pct_urls + FROM `httparchive.blink_features.usage` + WHERE + yyyymmdd = '20250601' AND + feature IN ('NavigatorDoNotTrack') +), + +pages AS ( + SELECT + client, + COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.privacy.navigator_doNotTrack') = 'true', page, NULL)) AS num_urls, + COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.privacy.navigator_doNotTrack') = 'true', page, NULL)) / COUNT(DISTINCT page) AS pct_urls + FROM `httparchive.crawl.pages` + WHERE + date = '2025-07-01' AND + is_root_page = TRUE + GROUP BY client +) + +SELECT + COALESCE(blink.client, pages.client) AS client, + blink.num_urls AS number_of_pages_usage_per_blink, + blink.pct_urls AS pct_of_websites_usage_per_blink, + pages.num_urls AS number_of_pages_usage_per_custom_metric, + pages.pct_urls AS pct_of_websites_usage_per_custom_metric +FROM blink +FULL OUTER JOIN pages +ON blink.client = pages.client diff --git a/sql/2025/privacy/number_of_websites_with_gpc.sql b/sql/2025/privacy/number_of_websites_with_gpc.sql new file mode 100644 index 00000000000..2b03afb7ebc --- /dev/null +++ b/sql/2025/privacy/number_of_websites_with_gpc.sql @@ -0,0 +1,34 @@ +# Pages that provide `/.well-known/gpc.json` for Global Privacy Control + +WITH pages AS ( + SELECT + client, + COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.well-known."/.well-known/gpc.json".found') = 'true', page, NULL)) / COUNT(DISTINCT page) AS pct_pages_well_known, + COUNT(DISTINCT 
IF(JSON_VALUE(custom_metrics, '$.well-known."/.well-known/gpc.json".found') = 'true', page, NULL)) AS number_of_pages_well_known, + COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.privacy.navigator_globalPrivacyControl') = 'true', page, NULL)) / COUNT(DISTINCT page) AS pct_pages_js_api, + COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.privacy.navigator_globalPrivacyControl') = 'true', page, NULL)) AS number_of_pages_js_api + FROM `httparchive.crawl.pages` + WHERE + date = '2025-07-01' AND + is_root_page = TRUE + GROUP BY client +), + +headers AS ( + SELECT + client, + COUNT(DISTINCT IF(headers.name = 'sec-gpc' AND headers.value = '1', page, NULL)) / COUNT(DISTINCT page) AS pct_pages_headers, + COUNT(DISTINCT IF(headers.name = 'sec-gpc' AND headers.value = '1', page, NULL)) AS number_of_pages_headers + FROM `httparchive.all.requests`, + UNNEST(response_headers) headers + WHERE + date = '2025-07-01' AND + is_root_page = TRUE AND + is_main_document = TRUE + GROUP BY client +) + +SELECT * +FROM pages +FULL OUTER JOIN headers +USING (client) diff --git a/sql/2025/privacy/number_of_websites_with_iab.sql b/sql/2025/privacy/number_of_websites_with_iab.sql new file mode 100644 index 00000000000..a2090b5df38 --- /dev/null +++ b/sql/2025/privacy/number_of_websites_with_iab.sql @@ -0,0 +1,61 @@ +# Counts of pages with IAB Frameworks +# TODO: check presence of multiple frameworks per page + +WITH privacy_custom_metrics_data AS ( + SELECT + client, + JSON_QUERY(custom_metrics, '$.privacy') AS metrics + FROM `httparchive.crawl.pages` + WHERE + date = '2025-07-01' AND + is_root_page = TRUE +) + +SELECT + client, + number_of_pages_with_tcfv1 / number_of_pages AS pct_pages_with_tcfv1, + number_of_pages_with_tcfv1, + number_of_pages_with_tcfv2 / number_of_pages AS pct_pages_with_tcfv2, + number_of_pages_with_tcfv2, + number_of_pages_with_usp / number_of_pages AS pct_pages_with_usp, + number_of_pages_with_usp, + number_of_pages_with_tcf / number_of_pages AS pct_pages_with_tcf, + 
number_of_pages_with_tcf, + number_of_pages_with_any / number_of_pages AS pct_pages_with_any, + number_of_pages_with_any, + number_of_pages_with_tcfv1_compliant / number_of_pages AS pct_pages_with_tcfv1_compliant, + number_of_pages_with_tcfv1_compliant, + number_of_pages_with_tcfv2_compliant / number_of_pages AS pct_pages_with_tcfv2_compliant, + number_of_pages_with_tcfv2_compliant, + number_of_pages_with_gpp / number_of_pages AS pct_pages_with_gpp, + number_of_pages_with_gpp, + number_of_pages_with_gpp_data / number_of_pages AS pct_pages_with_gpp_data, + number_of_pages_with_gpp_data +FROM ( + SELECT + client, + COUNT(0) AS number_of_pages, + COUNTIF(tcfv1) AS number_of_pages_with_tcfv1, + COUNTIF(tcfv2) AS number_of_pages_with_tcfv2, + COUNTIF(usp) AS number_of_pages_with_usp, + COUNTIF(tcfv1 OR tcfv2) AS number_of_pages_with_tcf, + COUNTIF(tcfv1 OR tcfv2 OR usp OR gpp) AS number_of_pages_with_any, + COUNTIF(tcfv1 AND tcfv1_compliant) AS number_of_pages_with_tcfv1_compliant, + COUNTIF(tcfv2 AND tcfv2_compliant) AS number_of_pages_with_tcfv2_compliant, + COUNTIF(gpp) AS number_of_pages_with_gpp, + COUNTIF(gpp_data) AS number_of_pages_with_gpp_data + FROM ( + SELECT + client, + JSON_VALUE(metrics, '$.iab_tcf_v1.present') = 'true' AS tcfv1, + JSON_VALUE(metrics, '$.iab_tcf_v2.present') = 'true' AS tcfv2, + JSON_VALUE(metrics, '$.iab_gpp.present') = 'true' AS gpp, + JSON_VALUE(metrics, '$.iab_usp.present') = 'true' AS usp, + JSON_VALUE(metrics, '$.iab_tcf_v1.compliant_setup') = 'true' AS tcfv1_compliant, + JSON_VALUE(metrics, '$.iab_tcf_v2.compliant_setup') = 'true' AS tcfv2_compliant, + JSON_VALUE(metrics, '$.iab_gpp.data') IS NOT NULL AS gpp_data + FROM + privacy_custom_metrics_data + ) + GROUP BY client +) diff --git a/sql/2025/privacy/number_of_websites_with_nb_trackers.sql b/sql/2025/privacy/number_of_websites_with_nb_trackers.sql new file mode 100644 index 00000000000..e8570c65e93 --- /dev/null +++ b/sql/2025/privacy/number_of_websites_with_nb_trackers.sql @@ 
-0,0 +1,96 @@ +# Number of websites that deploy a certain number of trackers +WITH whotracksme AS ( + SELECT + domain, + category, + tracker + FROM almanac.whotracksme + WHERE date = '2025-07-01' +), + +totals AS ( + SELECT + client, + COUNT(DISTINCT page) AS total_websites + FROM httparchive.crawl.requests + WHERE date = '2025-07-01' + GROUP BY client +) + +SELECT + client, + 'any' AS type, + number_of_trackers, + COUNT(DISTINCT page) AS number_of_websites, + total_websites, + COUNT(DISTINCT page) / total_websites AS pct_websites +FROM ( + SELECT + client, + page, + COUNT(DISTINCT tracker) AS number_of_trackers + FROM httparchive.crawl.requests + JOIN whotracksme + ON ( + NET.HOST(url) = domain OR + ENDS_WITH(NET.HOST(url), CONCAT('.', domain)) + ) + WHERE + date = '2025-07-01' AND + NET.REG_DOMAIN(page) != NET.REG_DOMAIN(url) + GROUP BY + client, + page +) +JOIN + totals +USING (client) +GROUP BY + client, + number_of_trackers, + total_websites +UNION ALL +SELECT + client, + 'any_tracker' AS type, + number_of_trackers, + COUNT(DISTINCT page) AS number_of_websites, + total_websites, + COUNT(DISTINCT page) / total_websites AS pct_websites +FROM ( + SELECT + client, + page, + COUNT(DISTINCT tracker) AS number_of_trackers + FROM httparchive.almanac.requests + JOIN + whotracksme + ON ( + NET.HOST(urlShort) = domain OR + ENDS_WITH(NET.HOST(urlShort), CONCAT('.', domain)) + ) + WHERE + date = '2025-07-01' AND + NET.REG_DOMAIN(page) != NET.REG_DOMAIN(url) AND -- third party + ( + -- categories selected from https://whotracks.me/blog/tracker_categories.html + whotracksme.category = 'advertising' OR + whotracksme.category = 'pornvertising' OR + whotracksme.category = 'site_analytics' OR + whotracksme.category = 'social_media' + ) + GROUP BY + client, + page +) +JOIN + totals +USING (client) +GROUP BY + client, + number_of_trackers, + total_websites +ORDER BY + client, + type, + number_of_trackers diff --git a/sql/2025/privacy/number_of_websites_with_referrerpolicy.sql 
b/sql/2025/privacy/number_of_websites_with_referrerpolicy.sql new file mode 100644 index 00000000000..0b91d2169ab --- /dev/null +++ b/sql/2025/privacy/number_of_websites_with_referrerpolicy.sql @@ -0,0 +1,88 @@ +WITH referrer_policy_custom_metrics AS ( + SELECT + client, + page, + JSON_VALUE(custom_metrics, '$.privacy.referrerPolicy.entire_document_policy') AS meta_policy, + ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.referrerPolicy.individual_requests')) > 0 AS individual_requests, + CAST(JSON_VALUE(custom_metrics, '$.privacy.referrerPolicy.link_relations.A') AS INT64) > 0 AS link_relations + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-07-01' AND + is_root_page = TRUE +), + +referrer_policy_headers AS ( + SELECT + client, + page, + LOWER(response_header.value) AS header_policy + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_header + WHERE + date = '2025-07-01' AND + is_root_page = TRUE AND + is_main_document = TRUE AND + response_header.name = 'referrer-policy' +) + +SELECT + client, + number_of_pages_with_entire_document_policy_meta / number_of_pages AS pct_pages_with_entire_document_policy_meta, + number_of_pages_with_entire_document_policy_meta, + number_of_pages_with_entire_document_policy_header / number_of_pages AS pct_pages_with_entire_document_policy_header, + number_of_pages_with_entire_document_policy_header, + number_of_pages_with_entire_document_policy / number_of_pages AS pct_pages_with_entire_document_policy, + number_of_pages_with_entire_document_policy, + number_of_pages_with_any_individual_requests / number_of_pages AS pct_pages_with_any_individual_requests, + number_of_pages_with_any_individual_requests, + number_of_pages_with_any_link_relations / number_of_pages AS pct_pages_with_any_link_relations, + number_of_pages_with_any_link_relations, + number_of_pages_with_any_referrer_policy / number_of_pages AS pct_pages_with_any_referrer_policy, + number_of_pages_with_any_referrer_policy +FROM ( + 
SELECT + client, + COUNT(DISTINCT page) AS number_of_pages, + COUNT(DISTINCT IF( + meta_policy IS NOT NULL, + page, NULL + )) AS number_of_pages_with_entire_document_policy_meta, + COUNT(DISTINCT IF( + header_policy IS NOT NULL, + page, NULL + )) AS number_of_pages_with_entire_document_policy_header, + COUNT( + DISTINCT IF( + meta_policy IS NOT NULL OR + header_policy IS NOT NULL, + page, NULL + ) + ) AS number_of_pages_with_entire_document_policy, + COUNT(DISTINCT IF( + individual_requests, + page, NULL + )) AS number_of_pages_with_any_individual_requests, + COUNT(DISTINCT IF( + link_relations, + page, NULL + )) AS number_of_pages_with_any_link_relations, + COUNT( + DISTINCT IF( + meta_policy IS NOT NULL OR + header_policy IS NOT NULL OR + individual_requests OR + link_relations, + page, NULL + ) + ) AS number_of_pages_with_any_referrer_policy + FROM + referrer_policy_custom_metrics + FULL OUTER JOIN + referrer_policy_headers + USING (client, page) + GROUP BY client +) +ORDER BY + client diff --git a/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql b/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql new file mode 100644 index 00000000000..591c302fa86 --- /dev/null +++ b/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql @@ -0,0 +1,105 @@ +# Pages that participate in the privacy-relayed origin trials +CREATE TEMP FUNCTION `DECODE_ORIGIN_TRIAL`(token STRING) RETURNS STRING DETERMINISTIC AS ( + SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) +); + +CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) +RETURNS STRUCT< + token STRING, + origin STRING, + feature STRING, + expiry TIMESTAMP, + is_subdomain BOOL, + is_third_party BOOL +> AS ( + STRUCT( + DECODE_ORIGIN_TRIAL(token) AS token, + JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.origin') AS origin, + JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.feature') AS feature, + TIMESTAMP_SECONDS(CAST(JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.expiry') AS 
INT64)) AS expiry, + JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isSubdomain') = 'true' AS is_subdomain, + JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isThirdParty') = 'true' AS is_third_party + ) +); + +WITH pages AS ( + SELECT + client, + page, + JSON_QUERY(custom_metrics, '$.origin-trials') AS ot_metrics, + JSON_QUERY(custom_metrics, '$.almanac') AS almanac_metrics + FROM `httparchive.crawl.pages` + WHERE + date = '2025-07-01' AND + is_root_page = TRUE +), + +response_headers AS ( + SELECT + client, + page, + PARSE_ORIGIN_TRIAL(response_header.value) AS ot -- may not lowercase this value as it is a base64 string + FROM `httparchive.all.requests`, + UNNEST(response_headers) response_header + WHERE + date = '2025-07-01' AND + is_root_page = TRUE AND + is_main_document = TRUE AND + LOWER(response_header.name) = 'origin-trial' +), + +meta_tags AS ( + SELECT + client, + page, + PARSE_ORIGIN_TRIAL(JSON_VALUE(meta_node, '$.content')) AS ot -- may not lowercase this value as it is a base64 string + FROM pages, + UNNEST(JSON_QUERY_ARRAY(almanac_metrics, '$.meta-nodes.nodes')) meta_node + WHERE + LOWER(JSON_VALUE(meta_node, '$.http-equiv')) = 'origin-trial' +), + +ot_from_custom_metric AS ( + SELECT + client, + page, + PARSE_ORIGIN_TRIAL(JSON_VALUE(metric, '$.token')) AS ot + FROM pages, + UNNEST(JSON_QUERY_ARRAY(ot_metrics)) metric +) + +SELECT + client, + feature, + number_of_pages / total_pages AS pct_pages, + number_of_pages, + is_active +FROM ( + SELECT + client, + ot.feature, + ot.expiry >= CURRENT_TIMESTAMP() AS is_active, + COUNT(DISTINCT page) AS number_of_pages + FROM ( + SELECT * FROM response_headers + UNION ALL + SELECT * FROM meta_tags + UNION ALL + SELECT * FROM ot_from_custom_metric + ) + GROUP BY + client, + feature, + is_active +) +LEFT JOIN ( + SELECT + client, + COUNT(DISTINCT page) AS total_pages + FROM pages + GROUP BY + client +) +USING (client) +ORDER BY + number_of_pages DESC diff --git 
a/sql/2025/privacy/number_of_websites_with_whotracksme_trackers.sql b/sql/2025/privacy/number_of_websites_with_whotracksme_trackers.sql new file mode 100644 index 00000000000..209061c0e72 --- /dev/null +++ b/sql/2025/privacy/number_of_websites_with_whotracksme_trackers.sql @@ -0,0 +1,43 @@ +WITH whotracksme AS ( + SELECT + domain, + category, + tracker + FROM `max-ostapenko.Public.whotracksme` + WHERE date = '2025-07-01' +), + +pre_aggregated AS ( + SELECT + client, + category, + page, + tracker, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages + FROM `httparchive.all.requests` + JOIN whotracksme + ON NET.REG_DOMAIN(url) = domain + WHERE + date = '2025-07-01' AND + is_root_page = TRUE AND + NET.REG_DOMAIN(page) != NET.REG_DOMAIN(url) -- third party + GROUP BY + client, + category, + tracker, + page +) + +SELECT + client, + category, + tracker, + COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages, + COUNT(DISTINCT page) AS number_of_pages +FROM pre_aggregated +GROUP BY + client, + category, + tracker +ORDER BY + pct_pages DESC diff --git a/sql/2025/privacy/privacy-sandbox-adoption-by-third-parties-by-publishers.sql b/sql/2025/privacy/privacy-sandbox-adoption-by-third-parties-by-publishers.sql new file mode 100644 index 00000000000..9d46cd2c71e --- /dev/null +++ b/sql/2025/privacy/privacy-sandbox-adoption-by-third-parties-by-publishers.sql @@ -0,0 +1,150 @@ +#standardSQL +# Adoption of different Privacy Sandbox (PS) features by different third-parties and by different publishers + +-- Extracting third-parties observed using PS APIs on a publisher +CREATE TEMP FUNCTION jsonObjectKeys(input STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + if (!input) { + return []; + } + return Object.keys(JSON.parse(input)); +"""; + +-- Extracting PS APIs being called by a given third-party (passed as "key") +CREATE TEMP FUNCTION jsonObjectValues(input STRING, key STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + if (!input) { + return []; + } + const jsonObject =
JSON.parse(input); + const values = jsonObject[key] || []; + + function splitByDelimiters(value) { + const delimiterRegex = new RegExp(',|, |\\n|\\u0000', 'g'); + return value.split(delimiterRegex).map(v => v.trim()).filter(v => v); + } + + const result = []; + const replacements = { + 'Ch': 'CH', 'Ua': 'UA', 'Wow64': 'WoW64', 'Dpr': 'DPR', 'Rtt': 'RTT', 'Ect': 'ECT', 'Etc': 'ETC', '-Architecture': '-Arch', '-Arc': '-Arch', '-Archh': '-Arch', + '-Factors': '-Factor', '-ETC': '-ECT', '-Modal': '-Model', '-UA-UA': '-UA', '-UAm': '-UA', 'UAmodel': 'UA-Model', 'UAplatform': 'UA-Platform', 'Secch-UA': 'Sec-CH-UA', + 'CH-Width': 'CH-Viewport-Width', '-UAodel': '-UA-Model', '-Platformua-Platform': '-Platform', '-Platformuser-Agent': '-Platform', '-Version"': '-Version' + }; + values.forEach(value => { + if (value.startsWith('accept-ch|')) { + const parts = splitByDelimiters(value.replace('accept-ch|', '')); + parts.forEach(part => { + if (["UA", "Arch", "Bitness", "Full-Version-List", "Mobile", "Model", "Platform", "Platform-Version", "WoW64"].includes(part)) { + result.push("Sec-CH-UA-" + part); + } else { + let formattedPart = part.split('-').map(segment => + segment.charAt(0).toUpperCase() + segment.slice(1).toLowerCase() + ).join('-'); + for (const [key, value] of Object.entries(replacements)) { + formattedPart = formattedPart.replace(new RegExp(key, 'g'), value); + } + result.push(formattedPart); + } + }); + } else { + result.push(value); + } + }); + + return result; +"""; + +WITH privacy_sandbox_features AS ( + SELECT + client, + CASE + WHEN rank <= 1000 THEN '1000' + WHEN rank <= 10000 THEN '10000' + WHEN rank <= 100000 THEN '100000' + WHEN rank <= 1000000 THEN '1000000' + WHEN rank <= 10000000 THEN '10000000' + ELSE 'Other' + END AS rank_group, + NET.REG_DOMAIN(page) AS publisher, + third_party_domain, + CASE + WHEN api LIKE '%opics%|%' + THEN + REPLACE(SUBSTR(api, 0, STRPOS(api, '|') - 1) || '-' || SPLIT(api, '|')[SAFE_OFFSET(1)], '|', '-') + WHEN api LIKE 
'attribution-reporting-register-source%' + THEN + SPLIT(api, '|')[OFFSET(0)] + ELSE + api + END AS feature + FROM `httparchive.crawl.pages`, + UNNEST(jsonObjectKeys(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'))) AS third_party_domain, + UNNEST(jsonObjectValues(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'), third_party_domain)) AS api + WHERE + date = '2025-07-01' AND + is_root_page = TRUE +), + +grouped_features AS ( + SELECT + rank_group, + feature, + COUNT(DISTINCT publisher) AS publisher_count, + COUNT(DISTINCT third_party_domain) AS third_party_count + FROM privacy_sandbox_features + GROUP BY rank_group, feature +), + +aggregated_features AS ( + SELECT + feature, + SUM(CASE WHEN rank_group = '1000' THEN publisher_count ELSE 0 END) AS total_publisher_leq_1000, + SUM(CASE WHEN rank_group = '1000' THEN publisher_count ELSE 0 END) AS distinct_publisher_leq_1000, + SUM(CASE WHEN rank_group = '1000' THEN third_party_count ELSE 0 END) AS total_third_parties_leq_1000, + SUM(CASE WHEN rank_group = '1000' THEN third_party_count ELSE 0 END) AS distinct_third_parties_leq_1000, + SUM(CASE WHEN rank_group = '10000' THEN publisher_count ELSE 0 END) AS total_publisher_leq_10000, + SUM(CASE WHEN rank_group = '10000' THEN publisher_count ELSE 0 END) AS distinct_publisher_leq_10000, + SUM(CASE WHEN rank_group = '10000' THEN third_party_count ELSE 0 END) AS total_third_parties_leq_10000, + SUM(CASE WHEN rank_group = '10000' THEN third_party_count ELSE 0 END) AS distinct_third_parties_leq_10000, + SUM(CASE WHEN rank_group = '100000' THEN publisher_count ELSE 0 END) AS total_publisher_leq_100000, + SUM(CASE WHEN rank_group = '100000' THEN publisher_count ELSE 0 END) AS distinct_publisher_leq_100000, + SUM(CASE WHEN rank_group = '100000' THEN third_party_count ELSE 0 END) AS total_third_parties_leq_100000, + SUM(CASE WHEN rank_group = '100000' THEN third_party_count ELSE 0 END) AS distinct_third_parties_leq_100000, + SUM(CASE WHEN 
rank_group = '1000000' THEN publisher_count ELSE 0 END) AS total_publisher_leq_1000000, + SUM(CASE WHEN rank_group = '1000000' THEN publisher_count ELSE 0 END) AS distinct_publisher_leq_1000000, + SUM(CASE WHEN rank_group = '1000000' THEN third_party_count ELSE 0 END) AS total_third_parties_leq_1000000, + SUM(CASE WHEN rank_group = '1000000' THEN third_party_count ELSE 0 END) AS distinct_third_parties_leq_1000000, + SUM(CASE WHEN rank_group = '10000000' THEN publisher_count ELSE 0 END) AS total_publisher_leq_10000000, + SUM(CASE WHEN rank_group = '10000000' THEN publisher_count ELSE 0 END) AS distinct_publisher_leq_10000000, + SUM(CASE WHEN rank_group = '10000000' THEN third_party_count ELSE 0 END) AS total_third_parties_leq_10000000, + SUM(CASE WHEN rank_group = '10000000' THEN third_party_count ELSE 0 END) AS distinct_third_parties_leq_10000000 + FROM grouped_features + GROUP BY feature +) + +SELECT + feature AS privacy_sandbox_features, + total_publisher_leq_1000, + distinct_publisher_leq_1000, + total_third_parties_leq_1000, + distinct_third_parties_leq_1000, + total_publisher_leq_10000, + distinct_publisher_leq_10000, + total_third_parties_leq_10000, + distinct_third_parties_leq_10000, + total_publisher_leq_100000, + distinct_publisher_leq_100000, + total_third_parties_leq_100000, + distinct_third_parties_leq_100000, + total_publisher_leq_1000000, + distinct_publisher_leq_1000000, + total_third_parties_leq_1000000, + distinct_third_parties_leq_1000000, + total_publisher_leq_10000000, + distinct_publisher_leq_10000000, + total_third_parties_leq_10000000, + distinct_third_parties_leq_10000000 +FROM aggregated_features +ORDER BY feature; diff --git a/sql/2025/privacy/top_ara_destinations_registered_by_most_publishers.sql b/sql/2025/privacy/top_ara_destinations_registered_by_most_publishers.sql new file mode 100644 index 00000000000..6e192dda53a --- /dev/null +++ b/sql/2025/privacy/top_ara_destinations_registered_by_most_publishers.sql @@ -0,0 +1,83 @@ 
+#standardSQL +# Top 25 Attribution Reporting API Destinations (i.e., advertisers) registered by the most number of distinct publishers (at site level) + +-- Extracting third-parties observed using ARA API on a publisher +CREATE TEMP FUNCTION jsonObjectKeys(input STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + if (!input) { + return []; + } + return Object.keys(JSON.parse(input)); +"""; + +-- Extracting ARA API source registration details being passed by a given third-party (passed as "key") +CREATE TEMP FUNCTION jsonObjectValues(input STRING, key STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + if (!input) { + return []; + } + const jsonObject = JSON.parse(input); + const values = jsonObject[key] || []; + const result = []; + + values.forEach(value => { + if (value.toLowerCase().startsWith('attribution-reporting-register-source|')) { + const parts = value.replace('attribution-reporting-register-source|', '').split('|'); + parts.forEach(part => { + if (part.startsWith('destination=')) { + const destinations = part.replace('destination=', '').split(','); + destinations.forEach(destination => { + result.push('destination=' + destination.trim()); + }); + } else { + result.push(part.trim()); + } + }); + } + }); + + return result; +"""; + +WITH ara_features AS ( + SELECT + client, + CASE + WHEN ara LIKE 'destination=%' THEN NET.REG_DOMAIN(REPLACE(ara, 'destination=', '')) + ELSE NULL + END AS destination, + COUNT(NET.REG_DOMAIN(page)) AS total_publishers, + COUNT(DISTINCT NET.REG_DOMAIN(page)) AS distinct_publishers, + COUNT(third_party_domain) AS total_third_party_domains, + COUNT(DISTINCT third_party_domain) AS distinct_third_party_domains + FROM `httparchive.crawl.pages`, + UNNEST(jsonObjectKeys(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'))) AS third_party_domain, + UNNEST(jsonObjectValues(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'), third_party_domain)) AS ara + WHERE + date = '2025-07-01' AND + is_root_page = TRUE + 
ara LIKE 'destination%' + GROUP BY client, destination + HAVING destination IS NOT NULL +), + +ranked_features AS ( + SELECT + client, + destination, + total_publishers, + distinct_publishers, + total_third_party_domains, + distinct_third_party_domains, + ROW_NUMBER() OVER ( + PARTITION BY client + ORDER BY distinct_publishers DESC + ) AS publisher_rank + FROM ara_features +) + +SELECT * FROM ranked_features +WHERE publisher_rank <= 25 +ORDER BY client, distinct_publishers DESC; diff --git a/sql/2025/privacy/top_ara_destinations_registered_by_most_third_parties.sql b/sql/2025/privacy/top_ara_destinations_registered_by_most_third_parties.sql new file mode 100644 index 00000000000..67c9142326f --- /dev/null +++ b/sql/2025/privacy/top_ara_destinations_registered_by_most_third_parties.sql @@ -0,0 +1,83 @@ +#standardSQL +# Top 25 Attribution Reporting API Destinations (i.e., advertisers) registered by the most number of distinct third-parties (at site level) + +-- Extracting third-parties observed using ARA API on a publisher +CREATE TEMP FUNCTION jsonObjectKeys(input STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + if (!input) { + return []; + } + return Object.keys(JSON.parse(input)); +"""; + +-- Extracting ARA API source registration details being passed by a given third-party (passed as "key") +CREATE TEMP FUNCTION jsonObjectValues(input STRING, key STRING) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + if (!input) { + return []; + } + const jsonObject = JSON.parse(input); + const values = jsonObject[key] || []; + const result = []; + + values.forEach(value => { + if (value.toLowerCase().startsWith('attribution-reporting-register-source|')) { + const parts = value.replace('attribution-reporting-register-source|', '').split('|'); + parts.forEach(part => { + if (part.startsWith('destination=')) { + const destinations = part.replace('destination=', '').split(','); + destinations.forEach(destination => { + result.push('destination=' + destination.trim()); + }); + } else { + 
result.push(part.trim()); + } + }); + } + }); + + return result; +"""; + +WITH ara_features AS ( + SELECT + client, + CASE + WHEN ara LIKE 'destination=%' THEN NET.REG_DOMAIN(REPLACE(ara, 'destination=', '')) + ELSE NULL + END AS destination, + COUNT(NET.REG_DOMAIN(page)) AS total_publishers, + COUNT(DISTINCT NET.REG_DOMAIN(page)) AS distinct_publishers, + COUNT(third_party_domain) AS total_third_party_domains, + COUNT(DISTINCT third_party_domain) AS distinct_third_party_domains + FROM `httparchive.crawl.pages`, + UNNEST(jsonObjectKeys(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'))) AS third_party_domain, + UNNEST(jsonObjectValues(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'), third_party_domain)) AS ara + WHERE + date = '2025-07-01' AND + is_root_page = TRUE AND + ara LIKE 'destination%' + GROUP BY client, destination + HAVING destination IS NOT NULL +), + +ranked_features AS ( + SELECT + client, + destination, + total_publishers, + distinct_publishers, + total_third_party_domains, + distinct_third_party_domains, + ROW_NUMBER() OVER ( + PARTITION BY client + ORDER BY distinct_third_party_domains DESC + ) AS third_party_domain_rank + FROM ara_features +) + +SELECT * FROM ranked_features +WHERE third_party_domain_rank <= 25 +ORDER BY client, distinct_third_party_domains DESC; diff --git a/sql/util/bq_to_sheets.ipynb b/sql/util/bq_to_sheets.ipynb index e32047eecb3..b95cd9eab1a 100644 --- a/sql/util/bq_to_sheets.ipynb +++ b/sql/util/bq_to_sheets.ipynb @@ -1,216 +1,216 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "OVkCxlRQH6Yt", - "outputId": "0e907d5e-3824-4b0c-935d-81e629702390" - }, - "outputs": [], - "source": [ - "# @title Download repo\n", - "!git clone 
https://github.com/HTTPArchive/almanac.httparchive.org.git" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "U37785Bxt5tE" - }, - "outputs": [], - "source": [ - "# @title Configure the chapter to process\n", - "GCP_PROJECT = 'httparchive' #@param {type: \"string\"}\n", - "almanac_year = 2024 #@param {type: \"integer\"}\n", - "chapter_name = 'privacy' #@param {type: \"string\"}\n", - "spreadsheet_url = 'https://docs.google.com/spreadsheets/d/18r8cT6x9lPdM-rXvXjsqx84W7ZDdTDYGD59xr0UGOwg/edit' #@param {type: \"string\", placeholder:\"Enter spreadsheet URL\"}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "UzhgG5xvbQ1E", - "outputId": "9cf3ef02-ec76-43ac-cd63-03edf7f2f619" - }, - "outputs": [], - "source": [ - "# @title Update chapter branch\n", - "branch_name = f'{chapter_name.lower()}-sql-{almanac_year}'\n", - "!cd almanac.httparchive.org/ && git checkout $branch_name && git pull" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "45dBifFPJAtO" - }, - "outputs": [], - "source": [ - "# @title Authenticate\n", - "import google.auth\n", - "import os\n", - "from google.colab import auth\n", - "from google.cloud import bigquery\n", - "\n", - "import gspread\n", - "from gspread_dataframe import set_with_dataframe\n", - "\n", - "\n", - "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT\n", - "auth.authenticate_user()\n", - "credentials, project = google.auth.default()\n", - "client = bigquery.Client()\n", - "gc = gspread.authorize(credentials)\n", - "\n", - "try:\n", - " ss = gc.open_by_url(spreadsheet_url)\n", - "except:\n", - " print('Spreadsheet not found')" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "collapsed": true, - "id": "nblNil985Tjt", - "outputId": "658cf8f9-cee5-44d0-a6cd-abcabd4038e2" - }, - "outputs": [], - "source": [ - "# @title Upload query results\n", - "\n", - "import glob\n", - "import re\n", - "from tabulate import tabulate\n", - "from IPython.display import clear_output\n", - "\n", - "\n", - "filename_match = '(number_of_websites_with_related_origin_trials|most_common_cname_domains)\\.sql' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", - "filename_match_exclude = '(ads_and_sellers_graph)\\.sql' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", - "dry_run = True # @param {type: \"boolean\"}\n", - "overwrite_sheets = True # @param {type: \"boolean\"}\n", - "maximum_tb_billed = None # @param {type: \"raw\", placeholder: \"Insert a number or empty to disable\"}\n", - "\n", - "filename_include_regexp = r'{}'.format(filename_match)\n", - "filename_exclude_regexp = r'{}'.format(filename_match_exclude)\n", - "folder = r'almanac.httparchive.org/sql/{year}/{chapter}/*.sql'.format(\n", - " year=almanac_year,\n", - " chapter=chapter_name.lower()\n", - ")\n", - "existing_sheets = [s.title for s in ss.worksheets()]\n", - "\n", - "# Print formatted logs\n", - "queries_processed_log = []\n", - "def print_logs_table(log=None, append=True):\n", - " if log:\n", - " queries_processed_log.append(log)\n", - " table = tabulate(queries_processed_log, headers=['Query name', 'TB processed/billed', 'Sheet name', 'Upload skipped reason'], tablefmt=\"grid\")\n", - " if not append:\n", - " del queries_processed_log[-1]\n", - " clear_output(wait=True)\n", - " print(table)\n", - "\n", - "# Find matching SQL queries and save results to Google Sheets.\n", - "for filepath in 
sorted(glob.iglob(folder)):\n", - " filename = filepath.split('/')[-1]\n", - "\n", - " print_logs_table([filename, 'Processing...', 'Processing...', 'Processing...'], append=False)\n", - "\n", - " if re.search(filename_include_regexp, filename) and not re.search(filename_exclude_regexp, filename):\n", - "\n", - " with open(filepath) as f:\n", - " query = f.read()\n", - "\n", - " try:\n", - " response = client.query(\n", - " query,\n", - " job_config = bigquery.QueryJobConfig(dry_run = True)\n", - " )\n", - " except Exception as e:\n", - " print_logs_table([filename, None, None, f'Dry run query error:\\n{e}'])\n", - " continue\n", - "\n", - " tb_processed = response.total_bytes_processed/1024/1024/1024/1024\n", - " sheet_title = re.sub(r'(\\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()\n", - "\n", - " if sheet_title in existing_sheets:\n", - " if overwrite_sheets:\n", - " st = ss.worksheet(sheet_title)\n", - " else:\n", - " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Sheet already exists'])\n", - " continue\n", - "\n", - " if dry_run:\n", - " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Dry run'])\n", - " continue\n", - "\n", - " try:\n", - " if maximum_tb_billed:\n", - " response = client.query(\n", - " query,\n", - " job_config = bigquery.QueryJobConfig(\n", - " maximum_bytes_billed = maximum_tb_billed*1024*1024*1024*1024\n", - " )\n", - " )\n", - " else:\n", - " response = client.query(query)\n", - "\n", - " df = response.to_dataframe()\n", - " if ('st' not in locals() or st.title != sheet_title):\n", - " st = ss.add_worksheet(sheet_title, rows = 1, cols = 1)\n", - " set_with_dataframe(st, df, resize=False)\n", - "\n", - " tb_billed = response.total_bytes_billed/1024/1024/1024/1024\n", - " print_logs_table([filename, f'{tb_billed:.3f}', sheet_title, None])\n", - "\n", - " except Exception as e:\n", - " print_logs_table([filename, f'{tb_processed:.3f}', None, f'Query error:\\n{e}'])\n", - " continue\n", - "\n", - " 
else:\n", - " print_logs_table([filename, None, None, 'Filename mismatch'])" - ] - } - ], - "metadata": { + "id": "OVkCxlRQH6Yt", + "outputId": "0e907d5e-3824-4b0c-935d-81e629702390" + }, + "outputs": [], + "source": [ + "# @title Download repo\n", + "!git clone https://github.com/HTTPArchive/almanac.httparchive.org.git" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "U37785Bxt5tE" + }, + "outputs": [], + "source": [ + "# @title Configure the chapter to process\n", + "GCP_PROJECT = 'httparchive' #@param {type: \"string\"}\n", + "almanac_year = 2024 #@param {type: \"integer\"}\n", + "chapter_name = 'privacy' #@param {type: \"string\"}\n", + "spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1Svyw40Th7VbigX6lpR1lb1WXwTUVKZWrK7O2YELrml4/edit' #@param {type: \"string\", placeholder:\"Enter spreadsheet URL\"}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", "colab": { - "provenance": [] + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" + "id": "UzhgG5xvbQ1E", + "outputId": "9cf3ef02-ec76-43ac-cd63-03edf7f2f619" + }, + "outputs": [], + "source": [ + "# @title Update chapter branch\n", + "branch_name = f'{chapter_name.lower()}-sql-{almanac_year}'\n", + "!cd almanac.httparchive.org/ && git checkout $branch_name && git pull" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "45dBifFPJAtO" + }, + "outputs": [], + "source": [ + "# @title Authenticate\n", + "import google.auth\n", + "import os\n", + "from google.colab import auth\n", + "from google.cloud import bigquery\n", + "\n", + "import gspread\n", + "from gspread_dataframe import set_with_dataframe\n", + "\n", + "\n", + "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT\n", + "auth.authenticate_user()\n", + "credentials, project = google.auth.default()\n", + "client = 
bigquery.Client()\n", + "gc = gspread.authorize(credentials)\n", + "\n", + "try:\n", + " ss = gc.open_by_url(spreadsheet_url)\n", + "except:\n", + " print('Spreadsheet not found')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" }, - "language_info": { - "name": "python", - "version": "3.12.4" - } + "collapsed": true, + "id": "nblNil985Tjt", + "outputId": "658cf8f9-cee5-44d0-a6cd-abcabd4038e2" + }, + "outputs": [], + "source": [ + "# @title Upload query results\n", + "\n", + "import glob\n", + "import re\n", + "from tabulate import tabulate\n", + "from IPython.display import clear_output\n", + "\n", + "\n", + "filename_match = '(number_of_websites_with_related_origin_trials|most_common_cname_domains)\\.sql' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", + "filename_match_exclude = '(ads_and_sellers_graph)\\.sql' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", + "dry_run = True # @param {type: \"boolean\"}\n", + "overwrite_sheets = True # @param {type: \"boolean\"}\n", + "maximum_tb_billed = None # @param {type: \"raw\", placeholder: \"Insert a number or empty to disable\"}\n", + "\n", + "filename_include_regexp = r'{}'.format(filename_match)\n", + "filename_exclude_regexp = r'{}'.format(filename_match_exclude)\n", + "folder = r'almanac.httparchive.org/sql/{year}/{chapter}/*.sql'.format(\n", + " year=almanac_year,\n", + " chapter=chapter_name.lower()\n", + ")\n", + "existing_sheets = [s.title for s in ss.worksheets()]\n", + "\n", + "# Print formatted logs\n", + "queries_processed_log = []\n", + "def print_logs_table(log=None, append=True):\n", + " if log:\n", + " queries_processed_log.append(log)\n", + " table = tabulate(queries_processed_log, headers=['Query name', 'TB processed/billed', 'Sheet name', 'Upload skipped reason'], tablefmt=\"grid\")\n", + " if not append:\n", + " del 
queries_processed_log[-1]\n", + " clear_output(wait=True)\n", + " print(table)\n", + "\n", + "# Find matching SQL queries and save results to Google Sheets.\n", + "for filepath in sorted(glob.iglob(folder)):\n", + " filename = filepath.split('/')[-1]\n", + "\n", + " print_logs_table([filename, 'Processing...', 'Processing...', 'Processing...'], append=False)\n", + "\n", + " if re.search(filename_include_regexp, filename) and not re.search(filename_exclude_regexp, filename):\n", + "\n", + " with open(filepath) as f:\n", + " query = f.read()\n", + "\n", + " try:\n", + " response = client.query(\n", + " query,\n", + " job_config = bigquery.QueryJobConfig(dry_run = True)\n", + " )\n", + " except Exception as e:\n", + " print_logs_table([filename, None, None, f'Dry run query error:\\n{e}'])\n", + " continue\n", + "\n", + " tb_processed = response.total_bytes_processed/1024/1024/1024/1024\n", + " sheet_title = re.sub(r'(\\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()\n", + "\n", + " if sheet_title in existing_sheets:\n", + " if overwrite_sheets:\n", + " st = ss.worksheet(sheet_title)\n", + " else:\n", + " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Sheet already exists'])\n", + " continue\n", + "\n", + " if dry_run:\n", + " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Dry run'])\n", + " continue\n", + "\n", + " try:\n", + " if maximum_tb_billed:\n", + " response = client.query(\n", + " query,\n", + " job_config = bigquery.QueryJobConfig(\n", + " maximum_bytes_billed = maximum_tb_billed*1024*1024*1024*1024\n", + " )\n", + " )\n", + " else:\n", + " response = client.query(query)\n", + "\n", + " df = response.to_dataframe()\n", + " if ('st' not in locals() or st.title != sheet_title):\n", + " st = ss.add_worksheet(sheet_title, rows = 1, cols = 1)\n", + " set_with_dataframe(st, df, resize=False)\n", + "\n", + " tb_billed = response.total_bytes_billed/1024/1024/1024/1024\n", + " print_logs_table([filename, f'{tb_billed:.3f}', 
sheet_title, None])\n", + "\n", + " except Exception as e:\n", + " print_logs_table([filename, f'{tb_processed:.3f}', None, f'Query error:\\n{e}'])\n", + " continue\n", + "\n", + " else:\n", + " print_logs_table([filename, None, None, 'Filename mismatch'])" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "name": "python", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } From 943ae2805c982da03b3be84f0521eef68c5876fe Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sat, 2 Aug 2025 01:48:51 +0200 Subject: [PATCH 02/13] query updates --- sql/2025/privacy/ccpa_most_common_phrases.sql | 31 -------------- sql/2025/privacy/ccpa_prevalence.sql | 27 ------------ .../privacy/cookies_top_first_party_names.sql | 10 ++--- .../cookies_top_third_party_domains.sql | 6 +-- .../privacy/cookies_top_third_party_names.sql | 8 ++-- .../privacy/easylist-tracker-detection.sql | 41 ------------------- .../fingerprinting_most_common_apis.sql | 36 ---------------- .../fingerprinting_most_common_scripts.sql | 23 ----------- .../privacy/fingerprinting_script_count.sql | 21 ---------- sql/2025/privacy/most_common_client_hints.sql | 14 +++---- .../most_common_cmps_for_iab_tcf_v2.sql | 2 +- .../privacy/most_common_cname_domains.sql | 13 +++--- .../privacy/most_common_referrer_policy.sql | 4 +- .../most_common_strings_for_iab_usp.sql | 2 +- ...stered_by_third_parties_and_publishers.sql | 19 ++------- ...er_of_privacy_sandbox_attested_domains.sql | 4 +- .../number_of_websites_with_client_hints.sql | 12 +++--- .../privacy/number_of_websites_with_dnt.sql | 6 +-- .../privacy/number_of_websites_with_gpc.sql | 10 ++--- .../privacy/number_of_websites_with_iab.sql | 16 ++++---- .../number_of_websites_with_nb_trackers.sql | 15 +++---- ...number_of_websites_with_referrerpolicy.sql | 12 
+++--- ...of_websites_with_related_origin_trials.sql | 14 +++---- ..._of_websites_with_whotracksme_trackers.sql | 4 +- ...doption-by-third-parties-by-publishers.sql | 19 ++------- ...inations_registered_by_most_publishers.sql | 19 ++------- ...tions_registered_by_most_third_parties.sql | 19 ++------- 27 files changed, 91 insertions(+), 316 deletions(-) delete mode 100644 sql/2025/privacy/ccpa_most_common_phrases.sql delete mode 100644 sql/2025/privacy/ccpa_prevalence.sql delete mode 100644 sql/2025/privacy/easylist-tracker-detection.sql delete mode 100644 sql/2025/privacy/fingerprinting_most_common_apis.sql delete mode 100644 sql/2025/privacy/fingerprinting_most_common_scripts.sql delete mode 100644 sql/2025/privacy/fingerprinting_script_count.sql diff --git a/sql/2025/privacy/ccpa_most_common_phrases.sql b/sql/2025/privacy/ccpa_most_common_phrases.sql deleted file mode 100644 index ae38070672d..00000000000 --- a/sql/2025/privacy/ccpa_most_common_phrases.sql +++ /dev/null @@ -1,31 +0,0 @@ -WITH pages_with_phrase AS ( - SELECT - client, - rank_grouping, - page, - COUNT(DISTINCT page) OVER (PARTITION BY client, rank_grouping) AS total_pages_with_phrase_in_rank_group, - JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases') AS ccpa_link_phrases - FROM `httparchive.crawl.pages`, --TABLESAMPLE SYSTEM (0.01 PERCENT) - UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping - WHERE date = '2025-07-01' AND - is_root_page = true AND - rank <= rank_grouping AND - array_length(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases')) > 0 -) - -SELECT - client, - rank_grouping, - link_phrase, - COUNT(DISTINCT page) AS num_pages, - COUNT(DISTINCT page) / any_value(total_pages_with_phrase_in_rank_group) AS pct_pages -FROM pages_with_phrase, - UNNEST(ccpa_link_phrases) AS link_phrase -GROUP BY - link_phrase, - rank_grouping, - client -ORDER BY - rank_grouping, - client, - num_pages DESC diff --git 
a/sql/2025/privacy/ccpa_prevalence.sql b/sql/2025/privacy/ccpa_prevalence.sql deleted file mode 100644 index c51db559ae7..00000000000 --- a/sql/2025/privacy/ccpa_prevalence.sql +++ /dev/null @@ -1,27 +0,0 @@ -WITH pages AS ( - SELECT - client, - rank_grouping, - page, - JSON_VALUE(custom_metrics, '$.privacy.ccpa_link.hasCCPALink') AS has_ccpa_link - FROM `httparchive.crawl.pages`, -- TABLESAMPLE SYSTEM (0.0025 PERCENT) - UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping - WHERE date = '2025-07-01' AND - is_root_page = true AND - rank <= rank_grouping -) - -SELECT - client, - rank_grouping, - has_ccpa_link, - COUNT(DISTINCT page) AS num_pages -FROM pages -GROUP BY - has_ccpa_link, - rank_grouping, - client -ORDER BY - rank_grouping, - client, - has_ccpa_link diff --git a/sql/2025/privacy/cookies_top_first_party_names.sql b/sql/2025/privacy/cookies_top_first_party_names.sql index 5b310e6fb75..c9d689c6c06 100644 --- a/sql/2025/privacy/cookies_top_first_party_names.sql +++ b/sql/2025/privacy/cookies_top_first_party_names.sql @@ -5,7 +5,7 @@ WITH pages AS ( client, root_page, custom_metrics, - COUNT(DISTINCT net.host(root_page)) OVER (PARTITION BY client) AS total_domains + COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains FROM `httparchive.crawl.pages` WHERE date = '2025-07-01' ), @@ -14,18 +14,18 @@ cookies AS ( SELECT client, cookie, - NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host, + NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_host, NET.HOST(root_page) AS firstparty_host, total_domains FROM pages, - UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie + UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie ) SELECT client, COUNT(DISTINCT firstparty_host) AS domain_count, - COUNT(DISTINCT firstparty_host) / any_value(total_domains) AS pct_domains, - JSON_VALUE(cookie, '$.name') AS cookie_name + COUNT(DISTINCT firstparty_host) / ANY_VALUE(total_domains) AS pct_domains, + 
SAFE.STRING(cookie.name) AS cookie_name FROM cookies WHERE firstparty_host LIKE '%' || cookie_host GROUP BY diff --git a/sql/2025/privacy/cookies_top_third_party_domains.sql b/sql/2025/privacy/cookies_top_third_party_domains.sql index c8feb446e42..a4d1ea09aaa 100644 --- a/sql/2025/privacy/cookies_top_third_party_domains.sql +++ b/sql/2025/privacy/cookies_top_third_party_domains.sql @@ -14,18 +14,18 @@ cookies AS ( client, page, cookie, - NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host, + NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_host, NET.HOST(root_page) AS firstparty_host, total_pages FROM pages, - UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie + UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie ) SELECT client, cookie_host, COUNT(DISTINCT page) AS page_count, - COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages + COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages FROM cookies WHERE firstparty_host NOT LIKE '%' || cookie_host GROUP BY diff --git a/sql/2025/privacy/cookies_top_third_party_names.sql b/sql/2025/privacy/cookies_top_third_party_names.sql index 981a77da56d..8c5eb2cbc0d 100644 --- a/sql/2025/privacy/cookies_top_third_party_names.sql +++ b/sql/2025/privacy/cookies_top_third_party_names.sql @@ -14,18 +14,18 @@ cookies AS ( SELECT client, cookie, - NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host, + NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_host, NET.HOST(root_page) AS firstparty_host, total_domains FROM pages, - UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie + UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie ) SELECT client, COUNT(DISTINCT firstparty_host) AS domain_count, - COUNT(DISTINCT firstparty_host) / any_value(total_domains) AS pct_domains, - JSON_VALUE(cookie, '$.name') AS cookie_name + COUNT(DISTINCT firstparty_host) / ANY_VALUE(total_domains) AS pct_domains, + SAFE.STRING(cookie.name) AS cookie_name FROM cookies WHERE firstparty_host NOT LIKE '%' || 
cookie_host GROUP BY diff --git a/sql/2025/privacy/easylist-tracker-detection.sql b/sql/2025/privacy/easylist-tracker-detection.sql deleted file mode 100644 index 15a9e2f5115..00000000000 --- a/sql/2025/privacy/easylist-tracker-detection.sql +++ /dev/null @@ -1,41 +0,0 @@ -CREATE TEMP FUNCTION -CheckDomainInURL(url STRING, domain STRING) -RETURNS INT64 -LANGUAGE js AS """ - return url.includes(domain) ? 1 : 0; -"""; - --- We need to use the `easylist_adservers.csv` to populate the table to get the list of domains to block --- https://github.com/easylist/easylist/blob/master/easylist/easylist_adservers.txt -WITH easylist_data AS ( - SELECT string_field_0 - FROM `httparchive.almanac.easylist_adservers` -), - -requests_data AS ( - SELECT url - FROM `httparchive.all.requests` - WHERE - date = '2025-07-01' AND - is_root_page = TRUE -), - -block_status AS ( - SELECT - r.url, - MAX( - CASE - WHEN CheckDomainInURL(r.url, e.string_field_0) = 1 THEN 1 - ELSE 0 - END - ) AS should_block - FROM requests_data r - LEFT JOIN easylist_data e - ON CheckDomainInURL(r.url, e.string_field_0) = 1 - GROUP BY r.url -) - -SELECT - COUNT(0) AS blocked_url_count -FROM block_status -WHERE should_block = 1; diff --git a/sql/2025/privacy/fingerprinting_most_common_apis.sql b/sql/2025/privacy/fingerprinting_most_common_apis.sql deleted file mode 100644 index f7d952e0ad5..00000000000 --- a/sql/2025/privacy/fingerprinting_most_common_apis.sql +++ /dev/null @@ -1,36 +0,0 @@ -CREATE TEMP FUNCTION getFingerprintingTypes(input STRING) -RETURNS ARRAY -LANGUAGE js AS """ -if (input) { - try { - return Object.keys(JSON.parse(input)) - } catch (e) { - return [] - } -} else { - return [] -} -"""; - -WITH pages AS ( - SELECT - client, - page, - fingerprinting_type, - COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages - FROM `httparchive.crawl.pages`, - UNNEST(getFingerprintingTypes(JSON_EXTRACT(custom_metrics, '$.privacy.fingerprinting.counts'))) AS fingerprinting_type - WHERE date = 
'2025-07-01' -) - -SELECT - client, - fingerprinting_type, - COUNT(DISTINCT page) AS page_count, - COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages -FROM pages -GROUP BY - client, - fingerprinting_type -ORDER BY - page_count DESC diff --git a/sql/2025/privacy/fingerprinting_most_common_scripts.sql b/sql/2025/privacy/fingerprinting_most_common_scripts.sql deleted file mode 100644 index 316c07b50d8..00000000000 --- a/sql/2025/privacy/fingerprinting_most_common_scripts.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH pages AS ( - SELECT - page, - client, - custom_metrics, - COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages - FROM `httparchive.crawl.pages` - WHERE date = '2025-07-01' -) - -SELECT - client, - script, - COUNT(DISTINCT page) AS page_count, - COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages -FROM pages, - UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script -GROUP BY - client, - script -ORDER BY - page_count DESC -LIMIT 100; diff --git a/sql/2025/privacy/fingerprinting_script_count.sql b/sql/2025/privacy/fingerprinting_script_count.sql deleted file mode 100644 index 3ca08b05326..00000000000 --- a/sql/2025/privacy/fingerprinting_script_count.sql +++ /dev/null @@ -1,21 +0,0 @@ -WITH pages AS ( - SELECT - page, - client, - ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script_count, - COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages - FROM `httparchive.crawl.pages` - WHERE date = '2025-07-01' -) - -SELECT - script_count, - client, - COUNT(DISTINCT page) AS page_count, - COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages -FROM pages -GROUP BY - script_count, - client -ORDER BY - script_count ASC; diff --git a/sql/2025/privacy/most_common_client_hints.sql b/sql/2025/privacy/most_common_client_hints.sql index 88c2267abf2..8358eb5c884 100644 --- a/sql/2025/privacy/most_common_client_hints.sql +++ 
b/sql/2025/privacy/most_common_client_hints.sql @@ -6,8 +6,8 @@ WITH response_headers AS ( LOWER(response_header.name) AS header_name, LOWER(response_header.value) AS header_value, COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_websites - FROM `httparchive.all.requests`, - UNNEST(response_headers) response_header + FROM `httparchive.crawl.requests`, + UNNEST(response_headers) AS response_header WHERE date = '2025-07-01' AND is_root_page = TRUE AND @@ -18,20 +18,20 @@ meta_tags AS ( SELECT client, page, - LOWER(JSON_VALUE(meta_node, '$.http-equiv')) AS tag_name, - LOWER(JSON_VALUE(meta_node, '$.content')) AS tag_value + LOWER(SAFE.STRING(meta_node.`http-equiv`)) AS tag_name, + LOWER(SAFE.STRING(meta_node.content)) AS tag_value FROM ( SELECT client, page, - JSON_QUERY(custom_metrics, '$.almanac') AS metrics + custom_metrics.other.almanac AS metrics FROM `httparchive.crawl.pages` WHERE date = '2025-07-01' AND is_root_page = TRUE ), - UNNEST(JSON_QUERY_ARRAY(metrics, '$.meta-nodes.nodes')) meta_node - WHERE JSON_VALUE(meta_node, '$.http-equiv') IS NOT NULL + UNNEST(JSON_QUERY_ARRAY(metrics.`meta-nodes`.nodes)) AS meta_node + WHERE SAFE.STRING(meta_node.`http-equiv`) IS NOT NULL ) SELECT diff --git a/sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql b/sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql index 53f76c63a2f..e3952f1925c 100644 --- a/sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql +++ b/sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql @@ -6,7 +6,7 @@ WITH cmps AS ( SELECT client, page, - JSON_VALUE(custom_metrics, '$.privacy.iab_tcf_v2.data.cmpId') AS cmpId, + SAFE.STRING(custom_metrics.privacy.iab_tcf_v2.data.cmpId) AS cmpId, COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages FROM `httparchive.crawl.pages` WHERE diff --git a/sql/2025/privacy/most_common_cname_domains.sql b/sql/2025/privacy/most_common_cname_domains.sql index b260273dc7c..625a1895933 100644 --- a/sql/2025/privacy/most_common_cname_domains.sql +++ 
b/sql/2025/privacy/most_common_cname_domains.sql @@ -1,9 +1,8 @@ # Most common CNAME domains -CREATE TEMP FUNCTION convert_cname_json(json_str STRING) +CREATE TEMP FUNCTION CONVERT_CNAME_JSON(obj JSON) RETURNS ARRAY> LANGUAGE js AS """ try { - const obj = JSON.parse(json_str); const result = []; for (const key in obj) { result.push({ @@ -37,10 +36,10 @@ cnames AS ( SELECT client, cnames.cname, - page - --ARRAY_AGG(DISTINCT page LIMIT 2) AS page_examples + page, + ARRAY_AGG(DISTINCT page LIMIT 2) AS page_examples FROM `httparchive.crawl.pages`, - UNNEST(convert_cname_json(JSON_QUERY(custom_metrics, '$.privacy.request_hostnames_with_cname'))) AS cnames + UNNEST(CONVERT_CNAME_JSON(custom_metrics.privacy.request_hostnames_with_cname)) AS cnames WHERE date = '2025-07-01' AND NET.REG_DOMAIN(cnames.origin) = NET.REG_DOMAIN(page) AND NET.REG_DOMAIN(cnames.cname) != NET.REG_DOMAIN(page) @@ -65,8 +64,8 @@ cname_stats AS ( NET.REG_DOMAIN(cname) AS cname, adguard_trackers.domain IS NOT NULL AS adguard_known_cname, whotracksme.category AS whotracksme_category, - COUNT(DISTINCT page) AS number_of_pages - --ANY_VALUE(page_examples) + COUNT(DISTINCT page) AS number_of_pages, + ANY_VALUE(page_examples) FROM cnames LEFT JOIN adguard_trackers ON ENDS_WITH(cnames.cname, adguard_trackers.domain) diff --git a/sql/2025/privacy/most_common_referrer_policy.sql b/sql/2025/privacy/most_common_referrer_policy.sql index de96ae3a3a7..2688340af9e 100644 --- a/sql/2025/privacy/most_common_referrer_policy.sql +++ b/sql/2025/privacy/most_common_referrer_policy.sql @@ -17,7 +17,7 @@ referrer_policy_custom_metrics AS ( page, LOWER(TRIM(policy_meta)) AS policy_meta FROM `httparchive.crawl.pages`, - UNNEST(SPLIT(JSON_VALUE(custom_metrics, '$.privacy.referrerPolicy.entire_document_policy'), ',')) AS policy_meta + UNNEST(SPLIT(SAFE.STRING(custom_metrics.privacy.referrerPolicy.entire_document_policy), ',')) AS policy_meta WHERE date = '2025-07-01' AND is_root_page = TRUE @@ -29,7 +29,7 @@ response_headers 
AS ( page, LOWER(response_header.name) AS name, LOWER(response_header.value) AS value - FROM `httparchive.all.requests`, + FROM `httparchive.crawl.requests`, UNNEST(response_headers) AS response_header WHERE date = '2025-07-01' AND diff --git a/sql/2025/privacy/most_common_strings_for_iab_usp.sql b/sql/2025/privacy/most_common_strings_for_iab_usp.sql index d6cfe4af0c9..1a447353ae9 100644 --- a/sql/2025/privacy/most_common_strings_for_iab_usp.sql +++ b/sql/2025/privacy/most_common_strings_for_iab_usp.sql @@ -5,7 +5,7 @@ WITH usp_data AS ( SELECT client, page, - JSON_VALUE(custom_metrics, '$.privacy.iab_usp.privacy_string.uspString') AS uspString, + SAFE.STRING(custom_metrics.privacy.iab_usp.privacy_string.uspString) AS uspString, COUNT(DISTINCT page) OVER (PARTITION BY client) AS pages_total FROM `httparchive.crawl.pages` WHERE diff --git a/sql/2025/privacy/number_of_ara_destinations_registered_by_third_parties_and_publishers.sql b/sql/2025/privacy/number_of_ara_destinations_registered_by_third_parties_and_publishers.sql index c8fd1e038a0..b52d5491b09 100644 --- a/sql/2025/privacy/number_of_ara_destinations_registered_by_third_parties_and_publishers.sql +++ b/sql/2025/privacy/number_of_ara_destinations_registered_by_third_parties_and_publishers.sql @@ -1,24 +1,13 @@ #standardSQL # Number of Attribution Reporting API Destinations (i.e., advertisers) registered, registering third-parties, and registering publishers (at site level) --- Extracting third-parties observed using ARA API on a publisher -CREATE TEMP FUNCTION jsonObjectKeys(input STRING) -RETURNS ARRAY -LANGUAGE js AS """ - if (!input) { - return []; - } - return Object.keys(JSON.parse(input)); -"""; - -- Extracting ARA API source registration details being passed by a given third-party (passed AS "key") -CREATE TEMP FUNCTION jsonObjectValues(input STRING, key STRING) +CREATE TEMP FUNCTION JSON_OBJECT_VALUES(jsonObject JSON, key STRING) RETURNS ARRAY LANGUAGE js AS """ - if (!input) { + if (!jsonObject) { 
return []; } - const jsonObject = JSON.parse(input); const values = jsonObject[key] || []; const result = []; @@ -59,8 +48,8 @@ WITH ara_features AS ( END AS destination, third_party_domain FROM `httparchive.crawl.pages`, - UNNEST(jsonObjectKeys(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'))) AS third_party_domain, - UNNEST(jsonObjectValues(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'), third_party_domain)) AS ara + UNNEST(JSON_KEYS(custom_metrics.other.`privacy-sandbox`.privacySandBoxAPIUsage)) AS third_party_domain, + UNNEST(JSON_OBJECT_VALUES(custom_metrics.other.`privacy-sandbox`.privacySandBoxAPIUsage, third_party_domain)) AS ara WHERE date = '2025-07-01' AND is_root_page = TRUE AND diff --git a/sql/2025/privacy/number_of_privacy_sandbox_attested_domains.sql b/sql/2025/privacy/number_of_privacy_sandbox_attested_domains.sql index d439a0a9c1f..7bba913cc74 100644 --- a/sql/2025/privacy/number_of_privacy_sandbox_attested_domains.sql +++ b/sql/2025/privacy/number_of_privacy_sandbox_attested_domains.sql @@ -13,8 +13,8 @@ WITH wellknown AS ( WHEN rank <= 10000000 THEN '10000000' ELSE 'Other' END AS rank_group, - CAST(JSON_VALUE(custom_metrics, '$.well-known."/.well-known/related-website-set.json".found') AS BOOL) AS rws, - CAST(JSON_VALUE(custom_metrics, '$.well-known."/.well-known/privacy-sandbox-attestations.json".found') AS BOOL) AS attestation + SAFE.BOOL(custom_metrics.other.`well-known`.`/.well-known/related-website-set.json`.found) AS rws, + SAFE.BOOL(custom_metrics.other.`well-known`.`/.well-known/privacy-sandbox-attestations.json`.found) AS attestation FROM `httparchive.crawl.pages` WHERE diff --git a/sql/2025/privacy/number_of_websites_with_client_hints.sql b/sql/2025/privacy/number_of_websites_with_client_hints.sql index 7953b46f929..a4fef23d131 100644 --- a/sql/2025/privacy/number_of_websites_with_client_hints.sql +++ b/sql/2025/privacy/number_of_websites_with_client_hints.sql @@ -5,7 +5,7 @@ WITH 
response_headers AS ( LOWER(response_header.name) AS header_name, LOWER(response_header.value) AS header_value, COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_websites - FROM `httparchive.all.requests`, + FROM `httparchive.crawl.requests`, UNNEST(response_headers) response_header WHERE date = '2025-07-01' AND @@ -16,18 +16,18 @@ meta_tags AS ( SELECT client, page, - LOWER(JSON_VALUE(meta_node, '$.http-equiv')) AS tag_name, - LOWER(JSON_VALUE(meta_node, '$.content')) AS tag_value + LOWER(SAFE.STRING(meta_node.`http-equiv`)) AS tag_name, + LOWER(SAFE.STRING(meta_node.content)) AS tag_value FROM ( SELECT client, page, - JSON_VALUE(custom_metrics, '$.almanac') AS metrics + custom_metrics.other.almanac AS metrics FROM `httparchive.crawl.pages` WHERE date = '2025-07-01' ), - UNNEST(JSON_QUERY_ARRAY(metrics, '$.meta-nodes.nodes')) meta_node - WHERE JSON_VALUE(meta_node, '$.http-equiv') IS NOT NULL + UNNEST(JSON_QUERY_ARRAY(metrics.`meta-nodes`.nodes)) AS meta_node + WHERE SAFE.STRING(meta_node.`http-equiv`) IS NOT NULL ) SELECT diff --git a/sql/2025/privacy/number_of_websites_with_dnt.sql b/sql/2025/privacy/number_of_websites_with_dnt.sql index 8cace44174e..66d50089d83 100644 --- a/sql/2025/privacy/number_of_websites_with_dnt.sql +++ b/sql/2025/privacy/number_of_websites_with_dnt.sql @@ -7,15 +7,15 @@ WITH blink AS ( pct_urls FROM `httparchive.blink_features.usage` WHERE - yyyymmdd = '20250601' AND + date = '2025-07-01' AND feature IN ('NavigatorDoNotTrack') ), pages AS ( SELECT client, - COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.privacy.navigator_doNotTrack') = 'true', page, NULL)) AS num_urls, - COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.privacy.navigator_doNotTrack') = 'true', page, NULL)) / COUNT(DISTINCT page) AS pct_urls + COUNT(DISTINCT IF(SAFE.BOOL(custom_metrics.privacy.navigator_doNotTrack), page, NULL)) AS num_urls, + COUNT(DISTINCT IF(SAFE.BOOL(custom_metrics.privacy.navigator_doNotTrack), page, NULL)) / COUNT(DISTINCT page) AS pct_urls 
FROM `httparchive.crawl.pages` WHERE date = '2025-07-01' AND diff --git a/sql/2025/privacy/number_of_websites_with_gpc.sql b/sql/2025/privacy/number_of_websites_with_gpc.sql index 2b03afb7ebc..bf903a0ea42 100644 --- a/sql/2025/privacy/number_of_websites_with_gpc.sql +++ b/sql/2025/privacy/number_of_websites_with_gpc.sql @@ -3,10 +3,10 @@ WITH pages AS ( SELECT client, - COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.well-known."/.well-known/gpc.json".found') = 'true', page, NULL)) / COUNT(DISTINCT page) AS pct_pages_well_known, - COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.well-known."/.well-known/gpc.json".found') = 'true', page, NULL)) AS number_of_pages_well_known, - COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.privacy.navigator_globalPrivacyControl') = 'true', page, NULL)) / COUNT(DISTINCT page) AS pct_pages_js_api, - COUNT(DISTINCT IF(JSON_VALUE(custom_metrics, '$.privacy.navigator_globalPrivacyControl') = 'true', page, NULL)) AS number_of_pages_js_api + COUNT(DISTINCT IF(SAFE.BOOL(custom_metrics.other.`well-known`.`/.well-known/gpc.json`.found), page, NULL)) / COUNT(DISTINCT page) AS pct_pages_well_known, + COUNT(DISTINCT IF(SAFE.BOOL(custom_metrics.other.`well-known`.`/.well-known/gpc.json`.found), page, NULL)) AS number_of_pages_well_known, + COUNT(DISTINCT IF(SAFE.BOOL(custom_metrics.privacy.navigator_globalPrivacyControl), page, NULL)) / COUNT(DISTINCT page) AS pct_pages_js_api, + COUNT(DISTINCT IF(SAFE.BOOL(custom_metrics.privacy.navigator_globalPrivacyControl), page, NULL)) AS number_of_pages_js_api FROM `httparchive.crawl.pages` WHERE date = '2025-07-01' AND @@ -19,7 +19,7 @@ headers AS ( client, COUNT(DISTINCT IF(headers.name = 'sec-gpc' AND headers.value = '1', page, NULL)) / COUNT(DISTINCT page) AS pct_pages_headers, COUNT(DISTINCT IF(headers.name = 'sec-gpc' AND headers.value = '1', page, NULL)) AS number_of_pages_headers - FROM `httparchive.all.requests`, + FROM `httparchive.crawl.requests`, UNNEST(response_headers) headers WHERE date = 
'2025-07-01' AND diff --git a/sql/2025/privacy/number_of_websites_with_iab.sql b/sql/2025/privacy/number_of_websites_with_iab.sql index a2090b5df38..4865ef9cf4a 100644 --- a/sql/2025/privacy/number_of_websites_with_iab.sql +++ b/sql/2025/privacy/number_of_websites_with_iab.sql @@ -4,7 +4,7 @@ WITH privacy_custom_metrics_data AS ( SELECT client, - JSON_QUERY(custom_metrics, '$.privacy') AS metrics + custom_metrics.privacy AS metrics FROM `httparchive.crawl.pages` WHERE date = '2025-07-01' AND @@ -47,13 +47,13 @@ FROM ( FROM ( SELECT client, - JSON_VALUE(metrics, '$.iab_tcf_v1.present') = 'true' AS tcfv1, - JSON_VALUE(metrics, '$.iab_tcf_v2.present') = 'true' AS tcfv2, - JSON_VALUE(metrics, '$.iab_gpp.present') = 'true' AS gpp, - JSON_VALUE(metrics, '$.iab_usp.present') = 'true' AS usp, - JSON_VALUE(metrics, '$.iab_tcf_v1.compliant_setup') = 'true' AS tcfv1_compliant, - JSON_VALUE(metrics, '$.iab_tcf_v2.compliant_setup') = 'true' AS tcfv2_compliant, - JSON_VALUE(metrics, '$.iab_gpp.data') IS NOT NULL AS gpp_data + SAFE.BOOL(metrics.iab_tcf_v1.present) AS tcfv1, + SAFE.BOOL(metrics.iab_tcf_v2.present) AS tcfv2, + SAFE.BOOL(metrics.iab_gpp.present) AS gpp, + SAFE.BOOL(metrics.iab_usp.present) AS usp, + SAFE.BOOL(metrics.iab_tcf_v1.compliant_setup) AS tcfv1_compliant, + SAFE.BOOL(metrics.iab_tcf_v2.compliant_setup) AS tcfv2_compliant, + metrics.iab_gpp.data IS NOT NULL AS gpp_data FROM privacy_custom_metrics_data ) diff --git a/sql/2025/privacy/number_of_websites_with_nb_trackers.sql b/sql/2025/privacy/number_of_websites_with_nb_trackers.sql index e8570c65e93..b54ad6d93cf 100644 --- a/sql/2025/privacy/number_of_websites_with_nb_trackers.sql +++ b/sql/2025/privacy/number_of_websites_with_nb_trackers.sql @@ -1,10 +1,11 @@ -# Number of websites that deploy a certain number of trackers +-- Number of websites that deploy a certain number of trackers + WITH whotracksme AS ( SELECT domain, category, tracker - FROM almanac.whotracksme + FROM `httparchive.almanac.whotracksme` 
WHERE date = '2025-07-01' ), @@ -12,7 +13,7 @@ totals AS ( SELECT client, COUNT(DISTINCT page) AS total_websites - FROM httparchive.crawl.requests + FROM `httparchive.crawl.requests` WHERE date = '2025-07-01' GROUP BY client ) @@ -29,7 +30,7 @@ FROM ( client, page, COUNT(DISTINCT tracker) AS number_of_trackers - FROM httparchive.crawl.requests + FROM `httparchive.crawl.requests` JOIN whotracksme ON ( NET.HOST(url) = domain OR @@ -62,12 +63,12 @@ FROM ( client, page, COUNT(DISTINCT tracker) AS number_of_trackers - FROM httparchive.almanac.requests + FROM `httparchive.crawl.requests` JOIN whotracksme ON ( - NET.HOST(urlShort) = domain OR - ENDS_WITH(NET.HOST(urlShort), CONCAT('.', domain)) + NET.HOST(url) = domain OR + ENDS_WITH(NET.HOST(url), CONCAT('.', domain)) ) WHERE date = '2025-07-01' AND diff --git a/sql/2025/privacy/number_of_websites_with_referrerpolicy.sql b/sql/2025/privacy/number_of_websites_with_referrerpolicy.sql index 0b91d2169ab..92fb30a988d 100644 --- a/sql/2025/privacy/number_of_websites_with_referrerpolicy.sql +++ b/sql/2025/privacy/number_of_websites_with_referrerpolicy.sql @@ -2,11 +2,10 @@ WITH referrer_policy_custom_metrics AS ( SELECT client, page, - JSON_VALUE(custom_metrics, '$.privacy.referrerPolicy.entire_document_policy') AS meta_policy, - ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.referrerPolicy.individual_requests')) > 0 AS individual_requests, - CAST(JSON_VALUE(custom_metrics, '$.privacy.referrerPolicy.link_relations.A') AS INT64) > 0 AS link_relations - FROM - `httparchive.crawl.pages` + SAFE.STRING(custom_metrics.privacy.referrerPolicy.entire_document_policy) AS meta_policy, + ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics.privacy.referrerPolicy.individual_requests)) > 0 AS individual_requests, + SAFE.INT64(custom_metrics.privacy.referrerPolicy.link_relations.A) > 0 AS link_relations + FROM `httparchive.crawl.pages` WHERE date = '2025-07-01' AND is_root_page = TRUE @@ -17,8 +16,7 @@ referrer_policy_headers AS ( client, 
page, LOWER(response_header.value) AS header_policy - FROM - `httparchive.all.requests`, + FROM `httparchive.crawl.requests`, UNNEST(response_headers) AS response_header WHERE date = '2025-07-01' AND diff --git a/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql b/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql index 591c302fa86..89e9eba8ccd 100644 --- a/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql +++ b/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql @@ -26,8 +26,8 @@ WITH pages AS ( SELECT client, page, - JSON_QUERY(custom_metrics, '$.origin-trials') AS ot_metrics, - JSON_QUERY(custom_metrics, '$.almanac') AS almanac_metrics + custom_metrics.other.`origin-trials` AS ot_metrics, + custom_metrics.other.almanac AS almanac_metrics FROM `httparchive.crawl.pages` WHERE date = '2025-07-01' AND @@ -39,7 +39,7 @@ response_headers AS ( client, page, PARSE_ORIGIN_TRIAL(response_header.value) AS ot -- may not lowercase this value as it is a base64 string - FROM `httparchive.all.requests`, + FROM `httparchive.crawl.requests`, UNNEST(response_headers) response_header WHERE date = '2025-07-01' AND @@ -52,18 +52,18 @@ meta_tags AS ( SELECT client, page, - PARSE_ORIGIN_TRIAL(JSON_VALUE(meta_node, '$.content')) AS ot -- may not lowercase this value as it is a base64 string + PARSE_ORIGIN_TRIAL(SAFE.STRING(meta_node.content)) AS ot -- may not lowercase this value as it is a base64 string FROM pages, - UNNEST(JSON_QUERY_ARRAY(almanac_metrics, '$.meta-nodes.nodes')) meta_node + UNNEST(JSON_QUERY_ARRAY(almanac_metrics.`meta-nodes`.nodes)) meta_node WHERE - LOWER(JSON_VALUE(meta_node, '$.http-equiv')) = 'origin-trial' + LOWER(SAFE.STRING(meta_node.`http-equiv`)) = 'origin-trial' ), ot_from_custom_metric AS ( SELECT client, page, - PARSE_ORIGIN_TRIAL(JSON_VALUE(metric, '$.token')) AS ot + PARSE_ORIGIN_TRIAL(SAFE.STRING(metric.token)) AS ot FROM pages, UNNEST(JSON_QUERY_ARRAY(ot_metrics)) metric ) diff --git 
a/sql/2025/privacy/number_of_websites_with_whotracksme_trackers.sql b/sql/2025/privacy/number_of_websites_with_whotracksme_trackers.sql index 209061c0e72..9691070ebf9 100644 --- a/sql/2025/privacy/number_of_websites_with_whotracksme_trackers.sql +++ b/sql/2025/privacy/number_of_websites_with_whotracksme_trackers.sql @@ -3,7 +3,7 @@ WITH whotracksme AS ( domain, category, tracker - FROM `max-ostapenko.Public.whotracksme` + FROM `httparchive.almanac.whotracksme` WHERE date = '2025-07-01' ), @@ -14,7 +14,7 @@ pre_aggregated AS ( page, tracker, COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages - FROM `httparchive.all.requests` + FROM `httparchive.crawl.requests` JOIN whotracksme ON NET.REG_DOMAIN(url) = domain WHERE diff --git a/sql/2025/privacy/privacy-sandbox-adoption-by-third-parties-by-publishers.sql b/sql/2025/privacy/privacy-sandbox-adoption-by-third-parties-by-publishers.sql index 9d46cd2c71e..a971f2e560d 100644 --- a/sql/2025/privacy/privacy-sandbox-adoption-by-third-parties-by-publishers.sql +++ b/sql/2025/privacy/privacy-sandbox-adoption-by-third-parties-by-publishers.sql @@ -1,24 +1,13 @@ #standardSQL # Adoption of different Privacy Sandbox (PS) features by different third-parties and by different publishers --- Extracting third-parties observed using PS APIs on a publisher -CREATE TEMP FUNCTION jsonObjectKeys(input STRING) -RETURNS ARRAY -LANGUAGE js AS """ - if (!input) { - return []; - } - return Object.keys(JSON.parse(input)); -"""; - -- Extracting PS APIs being called by a given third-party (passed as "key") -CREATE TEMP FUNCTION jsonObjectValues(input STRING, key STRING) +CREATE TEMP FUNCTION JSON_OBJECT_VALUES(jsonObject JSON, key STRING) RETURNS ARRAY LANGUAGE js AS """ - if (!input) { + if (!jsonObject) { return []; } - const jsonObject = JSON.parse(input); const values = jsonObject[key] || []; function splitByDelimiters(value) { @@ -80,8 +69,8 @@ WITH privacy_sandbox_features AS ( api END AS feature FROM `httparchive.crawl.pages`, - 
UNNEST(jsonObjectKeys(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'))) AS third_party_domain, - UNNEST(jsonObjectValues(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'), third_party_domain)) AS api + UNNEST(JSON_KEYS(custom_metrics.other.`privacy-sandbox`.privacySandBoxAPIUsage)) AS third_party_domain, + UNNEST(JSON_OBJECT_VALUES(custom_metrics.other.`privacy-sandbox`.privacySandBoxAPIUsage, third_party_domain)) AS api WHERE date = '2025-07-01' AND is_root_page = TRUE diff --git a/sql/2025/privacy/top_ara_destinations_registered_by_most_publishers.sql b/sql/2025/privacy/top_ara_destinations_registered_by_most_publishers.sql index 6e192dda53a..fe47074c3c2 100644 --- a/sql/2025/privacy/top_ara_destinations_registered_by_most_publishers.sql +++ b/sql/2025/privacy/top_ara_destinations_registered_by_most_publishers.sql @@ -1,24 +1,13 @@ #standardSQL # Top 25 Attribution Reporting API Destinations (i.e., advertisers) registered by the most number of distinct publishers (at site level) --- Extracting third-parties observed using ARA API on a publisher -CREATE TEMP FUNCTION jsonObjectKeys(input STRING) -RETURNS ARRAY -LANGUAGE js AS """ - if (!input) { - return []; - } - return Object.keys(JSON.parse(input)); -"""; - -- Extracting ARA API source registration details being passed by a given third-party (passed as "key") -CREATE TEMP FUNCTION jsonObjectValues(input STRING, key STRING) +CREATE TEMP FUNCTION JSON_OBJECT_VALUES(jsonObject JSON, key STRING) RETURNS ARRAY LANGUAGE js AS """ - if (!input) { + if (!jsonObject) { return []; } - const jsonObject = JSON.parse(input); const values = jsonObject[key] || []; const result = []; @@ -53,8 +42,8 @@ WITH ara_features AS ( COUNT(third_party_domain) AS total_third_party_domains, COUNT(DISTINCT third_party_domain) AS distinct_third_party_domains FROM `httparchive.crawl.pages`, - UNNEST(jsonObjectKeys(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'))) AS 
third_party_domain, - UNNEST(jsonObjectValues(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'), third_party_domain)) AS ara + UNNEST(JSON_KEYS(custom_metrics.other.`privacy-sandbox`.privacySandBoxAPIUsage)) AS third_party_domain, + UNNEST(JSON_OBJECT_VALUES(custom_metrics.other.`privacy-sandbox`.privacySandBoxAPIUsage, third_party_domain)) AS ara WHERE date = '2025-07-01' AND is_root_page = TRUE AND diff --git a/sql/2025/privacy/top_ara_destinations_registered_by_most_third_parties.sql b/sql/2025/privacy/top_ara_destinations_registered_by_most_third_parties.sql index 67c9142326f..89da104a5c9 100644 --- a/sql/2025/privacy/top_ara_destinations_registered_by_most_third_parties.sql +++ b/sql/2025/privacy/top_ara_destinations_registered_by_most_third_parties.sql @@ -1,24 +1,13 @@ #standardSQL # Top 25 Attribution Reporting API Destinations (i.e., advertisers) registered by the most number of distinct third-parties (at site level) --- Extracting third-parties observed using ARA API on a publisher -CREATE TEMP FUNCTION jsonObjectKeys(input STRING) -RETURNS ARRAY -LANGUAGE js AS """ - if (!input) { - return []; - } - return Object.keys(JSON.parse(input)); -"""; - -- Extracting ARA API source registration details being passed by a given third-party (passed as "key") -CREATE TEMP FUNCTION jsonObjectValues(input STRING, key STRING) +CREATE TEMP FUNCTION JSON_OBJECT_VALUES(jsonObject JSON, key STRING) RETURNS ARRAY LANGUAGE js AS """ - if (!input) { + if (!jsonObject) { return []; } - const jsonObject = JSON.parse(input); const values = jsonObject[key] || []; const result = []; @@ -53,8 +42,8 @@ WITH ara_features AS ( COUNT(third_party_domain) AS total_third_party_domains, COUNT(DISTINCT third_party_domain) AS distinct_third_party_domains FROM `httparchive.crawl.pages`, - UNNEST(jsonObjectKeys(JSON_QUERY(custom_metrics, '$.privacy-sandbox.privacySandBoxAPIUsage'))) AS third_party_domain, - UNNEST(jsonObjectValues(JSON_QUERY(custom_metrics, 
'$.privacy-sandbox.privacySandBoxAPIUsage'), third_party_domain)) AS ara + UNNEST(JSON_KEYS(custom_metrics.other.`privacy-sandbox`.privacySandBoxAPIUsage)) AS third_party_domain, + UNNEST(JSON_OBJECT_VALUES(custom_metrics.other.`privacy-sandbox`.privacySandBoxAPIUsage, third_party_domain)) AS ara WHERE date = '2025-07-01' AND is_root_page = TRUE AND From 451cca9a0a50987bf00b99fc59ef764f7de9d03e Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sat, 2 Aug 2025 02:09:37 +0200 Subject: [PATCH 03/13] sheet exporter update --- sql/util/bq_to_sheets.ipynb | 300 ++++++++++++++++++++++++++---------- 1 file changed, 218 insertions(+), 82 deletions(-) diff --git a/sql/util/bq_to_sheets.ipynb b/sql/util/bq_to_sheets.ipynb index b95cd9eab1a..2581abee78c 100644 --- a/sql/util/bq_to_sheets.ipynb +++ b/sql/util/bq_to_sheets.ipynb @@ -9,19 +9,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": { "cellView": "form", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "OVkCxlRQH6Yt", - "outputId": "0e907d5e-3824-4b0c-935d-81e629702390" + "id": "U37785Bxt5tE" }, "outputs": [], "source": [ - "# @title Download repo\n", - "!git clone https://github.com/HTTPArchive/almanac.httparchive.org.git" + "# @title Configure the chapter to process\n", + "GCP_PROJECT = 'httparchive' #@param {type: \"string\"}\n", + "almanac_year = 2025 #@param {type: \"integer\"}\n", + "chapter_name = 'privacy' #@param {type: \"string\"}\n", + "spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1Vdfg06z4I44VZBgzY0BeNCmSHjWcWeYIObJU4K0yZb4/edit' #@param {type: \"string\", placeholder:\"Enter spreadsheet URL\"}" ] }, { @@ -29,20 +28,21 @@ "execution_count": null, "metadata": { "cellView": "form", - "id": "U37785Bxt5tE" + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OVkCxlRQH6Yt", + "outputId": "0e907d5e-3824-4b0c-935d-81e629702390" }, "outputs": [], "source": [ - "# @title Configure 
the chapter to process\n", - "GCP_PROJECT = 'httparchive' #@param {type: \"string\"}\n", - "almanac_year = 2024 #@param {type: \"integer\"}\n", - "chapter_name = 'privacy' #@param {type: \"string\"}\n", - "spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1Svyw40Th7VbigX6lpR1lb1WXwTUVKZWrK7O2YELrml4/edit' #@param {type: \"string\", placeholder:\"Enter spreadsheet URL\"}" + "# @title Download repo (skip when running locally)\n", + "# !git clone https://github.com/HTTPArchive/almanac.httparchive.org.git" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": { "cellView": "form", "colab": { @@ -51,42 +51,95 @@ "id": "UzhgG5xvbQ1E", "outputId": "9cf3ef02-ec76-43ac-cd63-03edf7f2f619" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Branch: privacy-sql-2025\n" + ] + } + ], "source": [ - "# @title Update chapter branch\n", + "# @title Update chapter branch (skip when running locally)\n", "branch_name = f'{chapter_name.lower()}-sql-{almanac_year}'\n", - "!cd almanac.httparchive.org/ && git checkout $branch_name && git pull" + "print(f\"Branch: {branch_name}\")\n", + "# !cd almanac.httparchive.org/ && git checkout $branch_name && git pull" ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run to authenticate if in Colab (skip when running locally)\n", + "# from google.colab import auth\n", + "# auth.authenticate_user()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip 
install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "# Run to authenticate if not in Colab\n", + "# Prepare the environments as described in src/README.md\n", + "!pip install gspread gspread_dataframe tabulate -q" + ] + }, + { + "cell_type": "code", + "execution_count": 40, "metadata": { "cellView": "form", "id": "45dBifFPJAtO" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Spreadsheet authentication failed: \n", + "Note: Make sure you have access to the spreadsheet and proper Google credentials\n" + ] + } + ], "source": [ "# @title Authenticate\n", "import google.auth\n", "import os\n", - "from google.colab import auth\n", "from google.cloud import bigquery\n", "\n", "import gspread\n", "from gspread_dataframe import set_with_dataframe\n", "\n", - "\n", "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT\n", - "auth.authenticate_user()\n", + "\n", "credentials, project = google.auth.default()\n", "client = bigquery.Client()\n", "gc = gspread.authorize(credentials)\n", "\n", "try:\n", " ss = gc.open_by_url(spreadsheet_url)\n", - "except:\n", - " print('Spreadsheet not found')" + " existing_sheets = [s.title for s in ss.worksheets()]\n", + " print(f\"Successfully connected to spreadsheet with {len(existing_sheets)} existing sheets\")\n", + "except Exception as e:\n", + " print(f'Spreadsheet authentication failed: {e}')\n", + " print(\"Note: Make sure you have access to the spreadsheet and proper Google credentials\")\n", + " ss = None\n", + " existing_sheets = []" ] }, { @@ -101,7 +154,71 @@ "id": "nblNil985Tjt", "outputId": "658cf8f9-cee5-44d0-a6cd-abcabd4038e2" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| Query name | TB processed/billed | 
Sheet name | Upload skipped reason |\n", + "+===========================================================================+=======================+=======================================================================+=========================+\n", + "| cookies_top_first_party_names.sql | 0.081 | Cookies Top First Party Names | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| cookies_top_third_party_domains.sql | 0.083 | Cookies Top Third Party Domains | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| cookies_top_third_party_names.sql | 0.081 | Cookies Top Third Party Names | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_bounce_domains.sql | 6.166 | Most Common Bounce Domains | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_client_hints.sql | 5.217 | Most Common Client Hints | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_cmps_for_iab_tcf_v2.sql | 0.021 | Most Common Cmps For Iab Tcf V2 | Dry run |\n", + 
"+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_cname_domains.sql | 0.021 | Most Common Cname Domains | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_countries_for_iab_tcf_v2.sql | 0.02 | Most Common Countries For Iab Tcf V2 | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_referrer_policy.sql | 3.66 | Most Common Referrer Policy | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_strings_for_iab_usp.sql | 0.021 | Most Common Strings For Iab Usp | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_tracker_categories.sql | 0.973 | Most Common Tracker Categories | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_ara_destinations_registered_by_third_parties_and_publishers.sql | 1.573 | Number Of Ara Destinations Registered By Third Parties And Publishers | Dry run |\n", + 
"+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_privacy_sandbox_attested_domains.sql | 1.571 | Number Of Privacy Sandbox Attested Domains | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_per_technology.sql | 0.025 | Number Of Websites Per Technology | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_per_technology_category.sql | 0.016 | Number Of Websites Per Technology Category | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_using_each_fingerprinting.sql | 0.025 | Number Of Websites Using Each Fingerprinting | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_client_hints.sql | 2.895 | Number Of Websites With Client Hints | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_dnt.sql | 0.021 | Number Of Websites With Dnt | Dry run |\n", + 
"+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_gpc.sql | 5.235 | Number Of Websites With Gpc | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_iab.sql | 0.019 | Number Of Websites With Iab | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_nb_trackers.sql | 0.973 | Number Of Websites With Nb Trackers | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_referrerpolicy.sql | 3.664 | Number Of Websites With Referrerpolicy | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_related_origin_trials.sql | 5.217 | Number Of Websites With Related Origin Trials | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_whotracksme_trackers.sql | 0.978 | Number Of Websites With Whotracksme Trackers | Dry run |\n", + 
"+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| privacy-sandbox-adoption-by-third-parties-by-publishers.sql | 1.573 | Privacy Sandbox Adoption By Third Parties By Publishers | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| top_ara_destinations_registered_by_most_publishers.sql | 1.573 | Top Ara Destinations Registered By Most Publishers | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| top_ara_destinations_registered_by_most_third_parties.sql | 1.573 | Top Ara Destinations Registered By Most Third Parties | Dry run |\n", + "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n" + ] + } + ], "source": [ "# @title Upload query results\n", "\n", @@ -109,92 +226,102 @@ "import re\n", "from tabulate import tabulate\n", "from IPython.display import clear_output\n", + "import os\n", "\n", - "\n", - "filename_match = '(number_of_websites_with_related_origin_trials|most_common_cname_domains)\\.sql' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", - "filename_match_exclude = '(ads_and_sellers_graph)\\.sql' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", + "filename_match = '\\\\.sql$' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", + "filename_match_exclude = '^$' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in 
quotes\"}\n", "dry_run = True # @param {type: \"boolean\"}\n", - "overwrite_sheets = True # @param {type: \"boolean\"}\n", - "maximum_tb_billed = None # @param {type: \"raw\", placeholder: \"Insert a number or empty to disable\"}\n", + "overwrite_sheets = False # @param {type: \"boolean\"}\n", + "maximum_tb_billed = 0.5 # @param {type: \"raw\", placeholder: \"Insert a number or empty to disable\"}\n", "\n", "filename_include_regexp = r'{}'.format(filename_match)\n", "filename_exclude_regexp = r'{}'.format(filename_match_exclude)\n", - "folder = r'almanac.httparchive.org/sql/{year}/{chapter}/*.sql'.format(\n", - " year=almanac_year,\n", - " chapter=chapter_name.lower()\n", - ")\n", - "existing_sheets = [s.title for s in ss.worksheets()]\n", + "\n", + "folder = os.path.join(os.getcwd(), '../', str(almanac_year), chapter_name.lower(), '*.sql')\n", + "\n", + "print(f\"Looking for SQL files in: {folder}\")\n", "\n", "# Print formatted logs\n", "queries_processed_log = []\n", "def print_logs_table(log=None, append=True):\n", " if log:\n", " queries_processed_log.append(log)\n", - " table = tabulate(queries_processed_log, headers=['Query name', 'TB processed/billed', 'Sheet name', 'Upload skipped reason'], tablefmt=\"grid\")\n", + " table = tabulate(queries_processed_log, headers=['Query name', 'TB processed - estimate', 'Sheet name', 'Upload skipped reason'], tablefmt=\"grid\")\n", " if not append:\n", " del queries_processed_log[-1]\n", " clear_output(wait=True)\n", " print(table)\n", "\n", "# Find matching SQL queries and save results to Google Sheets.\n", - "for filepath in sorted(glob.iglob(folder)):\n", - " filename = filepath.split('/')[-1]\n", + "sql_files = list(glob.iglob(folder))\n", + "print(f\"Found {len(sql_files)} SQL files\")\n", + "\n", + "if not sql_files:\n", + " print(\"No SQL files found. 
Check the folder path.\")\n", + "else:\n", + " for filepath in sorted(sql_files):\n", + " filename = os.path.basename(filepath)\n", "\n", - " print_logs_table([filename, 'Processing...', 'Processing...', 'Processing...'], append=False)\n", + " print_logs_table([filename, 'Processing...', 'Processing...', 'Processing...'], append=False)\n", "\n", - " if re.search(filename_include_regexp, filename) and not re.search(filename_exclude_regexp, filename):\n", + " if re.search(filename_include_regexp, filename) and not re.search(filename_exclude_regexp, filename):\n", "\n", - " with open(filepath) as f:\n", - " query = f.read()\n", + " with open(filepath) as f:\n", + " query = f.read()\n", + "\n", + " try:\n", + " response = client.query(\n", + " query,\n", + " job_config = bigquery.QueryJobConfig(dry_run = True)\n", + " )\n", + " except Exception as e:\n", + " print_logs_table([filename, None, None, f'Dry run query error:\\n{e}'])\n", + " continue\n", "\n", - " try:\n", - " response = client.query(\n", - " query,\n", - " job_config = bigquery.QueryJobConfig(dry_run = True)\n", - " )\n", - " except Exception as e:\n", - " print_logs_table([filename, None, None, f'Dry run query error:\\n{e}'])\n", - " continue\n", + " tb_processed = response.total_bytes_processed/1024/1024/1024/1024\n", + " sheet_title = re.sub(r'(\\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()\n", "\n", - " tb_processed = response.total_bytes_processed/1024/1024/1024/1024\n", - " sheet_title = re.sub(r'(\\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()\n", + " if sheet_title in existing_sheets:\n", + " if overwrite_sheets:\n", + " st = ss.worksheet(sheet_title)\n", + " else:\n", + " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Sheet already exists'])\n", + " continue\n", "\n", - " if sheet_title in existing_sheets:\n", - " if overwrite_sheets:\n", - " st = ss.worksheet(sheet_title)\n", - " else:\n", - " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Sheet 
already exists'])\n", + " if dry_run:\n", + " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Dry run'])\n", " continue\n", "\n", - " if dry_run:\n", - " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Dry run'])\n", - " continue\n", + " # Skip actual execution if no spreadsheet connection\n", + " if ss is None:\n", + " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'No spreadsheet connection'])\n", + " continue\n", "\n", - " try:\n", - " if maximum_tb_billed:\n", - " response = client.query(\n", - " query,\n", - " job_config = bigquery.QueryJobConfig(\n", - " maximum_bytes_billed = maximum_tb_billed*1024*1024*1024*1024\n", + " try:\n", + " if maximum_tb_billed:\n", + " response = client.query(\n", + " query,\n", + " job_config = bigquery.QueryJobConfig(\n", + " maximum_bytes_billed = maximum_tb_billed*1024*1024*1024*1024\n", + " )\n", " )\n", - " )\n", - " else:\n", - " response = client.query(query)\n", + " else:\n", + " response = client.query(query)\n", "\n", - " df = response.to_dataframe()\n", - " if ('st' not in locals() or st.title != sheet_title):\n", - " st = ss.add_worksheet(sheet_title, rows = 1, cols = 1)\n", - " set_with_dataframe(st, df, resize=False)\n", + " df = response.to_dataframe()\n", + " if ('st' not in locals() or st.title != sheet_title):\n", + " st = ss.add_worksheet(sheet_title, rows = 1, cols = 1)\n", + " set_with_dataframe(st, df, resize=False)\n", "\n", - " tb_billed = response.total_bytes_billed/1024/1024/1024/1024\n", - " print_logs_table([filename, f'{tb_billed:.3f}', sheet_title, None])\n", + " tb_billed = response.total_bytes_billed/1024/1024/1024/1024\n", + " print_logs_table([filename, f'{tb_billed:.3f}', sheet_title, None])\n", "\n", - " except Exception as e:\n", - " print_logs_table([filename, f'{tb_processed:.3f}', None, f'Query error:\\n{e}'])\n", - " continue\n", + " except Exception as e:\n", + " print_logs_table([filename, f'{tb_processed:.3f}', None, f'Query 
error:\\n{e}'])\n", + " continue\n", "\n", - " else:\n", - " print_logs_table([filename, None, None, 'Filename mismatch'])" + " else:\n", + " print_logs_table([filename, None, None, 'Filename mismatch'])" ] } ], @@ -203,12 +330,21 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv (3.12.7)", + "language": "python", "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.12.4" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" } }, "nbformat": 4, From e394bb635b2ebb97db409d36e137b93385b89dd9 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sat, 2 Aug 2025 02:18:35 +0200 Subject: [PATCH 04/13] ID update --- sql/util/bq_to_sheets.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/util/bq_to_sheets.ipynb b/sql/util/bq_to_sheets.ipynb index 2581abee78c..6ba694515c4 100644 --- a/sql/util/bq_to_sheets.ipynb +++ b/sql/util/bq_to_sheets.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "metadata": { "cellView": "form", "id": "U37785Bxt5tE" @@ -20,7 +20,7 @@ "GCP_PROJECT = 'httparchive' #@param {type: \"string\"}\n", "almanac_year = 2025 #@param {type: \"integer\"}\n", "chapter_name = 'privacy' #@param {type: \"string\"}\n", - "spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1Vdfg06z4I44VZBgzY0BeNCmSHjWcWeYIObJU4K0yZb4/edit' #@param {type: \"string\", placeholder:\"Enter spreadsheet URL\"}" + "spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1Svyw40Th7VbigX6lpR1lb1WXwTUVKZWrK7O2YELrml4/edit' #@param {type: \"string\", placeholder:\"Enter spreadsheet URL\"}" ] }, { @@ -144,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": { "cellView": "form", "colab": { From 
2a7db9b4a2d2693faaaf64c6cbab1d0de9aac334 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sat, 2 Aug 2025 02:18:47 +0200 Subject: [PATCH 05/13] formatting --- sql/2025/privacy/most_common_bounce_domains.sql | 1 + sql/2025/privacy/most_common_client_hints.sql | 3 ++- sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql | 6 +++--- sql/2025/privacy/most_common_cname_domains.sql | 7 ++++--- .../privacy/most_common_countries_for_iab_tcf_v2.sql | 10 +++++----- sql/2025/privacy/most_common_referrer_policy.sql | 2 +- sql/2025/privacy/most_common_strings_for_iab_usp.sql | 4 ++-- sql/2025/privacy/most_common_tracker_categories.sql | 3 ++- ...ions_registered_by_third_parties_and_publishers.sql | 3 +-- .../number_of_privacy_sandbox_attested_domains.sql | 3 +-- .../number_of_websites_using_each_fingerprinting.sql | 3 ++- sql/2025/privacy/number_of_websites_with_dnt.sql | 2 +- sql/2025/privacy/number_of_websites_with_gpc.sql | 2 +- sql/2025/privacy/number_of_websites_with_iab.sql | 4 ++-- .../number_of_websites_with_related_origin_trials.sql | 3 ++- ...sandbox-adoption-by-third-parties-by-publishers.sql | 3 +-- ..._ara_destinations_registered_by_most_publishers.sql | 3 +-- ...a_destinations_registered_by_most_third_parties.sql | 3 +-- 18 files changed, 33 insertions(+), 32 deletions(-) diff --git a/sql/2025/privacy/most_common_bounce_domains.sql b/sql/2025/privacy/most_common_bounce_domains.sql index 91f007d26f2..b13f7552f8b 100644 --- a/sql/2025/privacy/most_common_bounce_domains.sql +++ b/sql/2025/privacy/most_common_bounce_domains.sql @@ -1,6 +1,7 @@ -- Detection logic explained: -- https://github.com/privacycg/proposals/issues/6 -- https://github.com/privacycg/nav-tracking-mitigations/blob/main/bounce-tracking-explainer.md + WITH redirect_requests AS ( SELECT client, diff --git a/sql/2025/privacy/most_common_client_hints.sql b/sql/2025/privacy/most_common_client_hints.sql index 8358eb5c884..dcf0d4c16da 100644 --- 
a/sql/2025/privacy/most_common_client_hints.sql +++ b/sql/2025/privacy/most_common_client_hints.sql @@ -1,4 +1,5 @@ -# Pages that use Client Hints +-- Pages that use Client Hints + WITH response_headers AS ( SELECT client, diff --git a/sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql b/sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql index e3952f1925c..09dce3f75e8 100644 --- a/sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql +++ b/sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql @@ -1,6 +1,6 @@ -# Counts of CMPs using IAB Transparency & Consent Framework -# cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md#tcdata -# CMP vendor list: https://iabeurope.eu/cmp-list/ +-- Counts of CMPs using IAB Transparency & Consent Framework +-- cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md--tcdata +-- CMP vendor list: https://iabeurope.eu/cmp-list/ WITH cmps AS ( SELECT diff --git a/sql/2025/privacy/most_common_cname_domains.sql b/sql/2025/privacy/most_common_cname_domains.sql index 625a1895933..6f0cb9ead0b 100644 --- a/sql/2025/privacy/most_common_cname_domains.sql +++ b/sql/2025/privacy/most_common_cname_domains.sql @@ -1,4 +1,5 @@ -# Most common CNAME domains +-- Most common CNAME domains + CREATE TEMP FUNCTION CONVERT_CNAME_JSON(obj JSON) RETURNS ARRAY> LANGUAGE js AS """ @@ -16,8 +17,8 @@ try { } """; -# Adguard CNAME Trackers source: -# https://github.com/AdguardTeam/cname-trackers/blob/master/script/src/cloaked-trackers.json +-- Adguard CNAME Trackers source: +-- https://github.com/AdguardTeam/cname-trackers/blob/master/script/src/cloaked-trackers.json WITH adguard_trackers AS ( SELECT domain diff --git a/sql/2025/privacy/most_common_countries_for_iab_tcf_v2.sql b/sql/2025/privacy/most_common_countries_for_iab_tcf_v2.sql index 891f58fdb62..16dfe503255 
100644 --- a/sql/2025/privacy/most_common_countries_for_iab_tcf_v2.sql +++ b/sql/2025/privacy/most_common_countries_for_iab_tcf_v2.sql @@ -1,8 +1,8 @@ -# Counts of countries for publishers using IAB Transparency & Consent Framework -# cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md#tcdata -# "Country code of the country that determines the legislation of -# reference. Normally corresponds to the country code of the country -# in which the publisher's business entity is established." +-- Counts of countries for publishers using IAB Transparency & Consent Framework +-- cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md--tcdata +-- "Country code of the country that determines the legislation of +-- reference. Normally corresponds to the country code of the country +-- in which the publisher's business entity is established." WITH totals AS ( SELECT diff --git a/sql/2025/privacy/most_common_referrer_policy.sql b/sql/2025/privacy/most_common_referrer_policy.sql index 2688340af9e..eed79b736e3 100644 --- a/sql/2025/privacy/most_common_referrer_policy.sql +++ b/sql/2025/privacy/most_common_referrer_policy.sql @@ -1,4 +1,4 @@ -# Most common values for Referrer-Policy (at site level) +-- Most common values for Referrer-Policy (at site level) WITH totals AS ( SELECT diff --git a/sql/2025/privacy/most_common_strings_for_iab_usp.sql b/sql/2025/privacy/most_common_strings_for_iab_usp.sql index 1a447353ae9..837b7ff2375 100644 --- a/sql/2025/privacy/most_common_strings_for_iab_usp.sql +++ b/sql/2025/privacy/most_common_strings_for_iab_usp.sql @@ -1,5 +1,5 @@ -# Counts of US Privacy String values for websites using IAB US Privacy Framework -# cf. 
https://github.com/InteractiveAdvertisingBureau/USPrivacy/blob/master/CCPA/US%20Privacy%20String.md +-- Counts of US Privacy String values for websites using IAB US Privacy Framework +-- cf. https://github.com/InteractiveAdvertisingBureau/USPrivacy/blob/master/CCPA/US%20Privacy%20String.md WITH usp_data AS ( SELECT diff --git a/sql/2025/privacy/most_common_tracker_categories.sql b/sql/2025/privacy/most_common_tracker_categories.sql index c93aeadcac8..31fe6d707d5 100644 --- a/sql/2025/privacy/most_common_tracker_categories.sql +++ b/sql/2025/privacy/most_common_tracker_categories.sql @@ -1,4 +1,5 @@ -# Percent of pages that deploy at least one tracker from each tracker category +-- Percent of pages that deploy at least one tracker from each tracker category + WITH whotracksme AS ( SELECT domain, diff --git a/sql/2025/privacy/number_of_ara_destinations_registered_by_third_parties_and_publishers.sql b/sql/2025/privacy/number_of_ara_destinations_registered_by_third_parties_and_publishers.sql index b52d5491b09..4b692ee59bd 100644 --- a/sql/2025/privacy/number_of_ara_destinations_registered_by_third_parties_and_publishers.sql +++ b/sql/2025/privacy/number_of_ara_destinations_registered_by_third_parties_and_publishers.sql @@ -1,5 +1,4 @@ -#standardSQL -# Number of Attribution Reporting API Destinations (i.e., advertisers) registered, registering third-parties, and registering publishers (at site level) +-- Number of Attribution Reporting API Destinations (i.e., advertisers) registered, registering third-parties, and registering publishers (at site level) -- Extracting ARA API source registration details being passed by a given third-party (passed AS "key") CREATE TEMP FUNCTION JSON_OBJECT_VALUES(jsonObject JSON, key STRING) diff --git a/sql/2025/privacy/number_of_privacy_sandbox_attested_domains.sql b/sql/2025/privacy/number_of_privacy_sandbox_attested_domains.sql index 7bba913cc74..a2f98701bd4 100644 --- a/sql/2025/privacy/number_of_privacy_sandbox_attested_domains.sql 
+++ b/sql/2025/privacy/number_of_privacy_sandbox_attested_domains.sql @@ -1,5 +1,4 @@ -#standardSQL -# Privacy Sandbox Attestation and Related Websites JSON status (i.e., advertisers) registered, registering third-parties, and registering publishers (at site level) +-- Privacy Sandbox Attestation and Related Websites JSON status (i.e., advertisers) registered, registering third-parties, and registering publishers (at site level) WITH wellknown AS ( SELECT diff --git a/sql/2025/privacy/number_of_websites_using_each_fingerprinting.sql b/sql/2025/privacy/number_of_websites_using_each_fingerprinting.sql index 67068c3fca4..da695ed3be7 100644 --- a/sql/2025/privacy/number_of_websites_using_each_fingerprinting.sql +++ b/sql/2025/privacy/number_of_websites_using_each_fingerprinting.sql @@ -1,4 +1,5 @@ -# Percent of websites using a fingerprinting library based on wappalyzer category +-- Percent of websites using a fingerprinting library based on wappalyzer category + WITH totals AS ( SELECT client, diff --git a/sql/2025/privacy/number_of_websites_with_dnt.sql b/sql/2025/privacy/number_of_websites_with_dnt.sql index 66d50089d83..9e79e93848c 100644 --- a/sql/2025/privacy/number_of_websites_with_dnt.sql +++ b/sql/2025/privacy/number_of_websites_with_dnt.sql @@ -1,4 +1,4 @@ -# Pages that request DNT status +-- Pages that request DNT status WITH blink AS ( SELECT DISTINCT diff --git a/sql/2025/privacy/number_of_websites_with_gpc.sql b/sql/2025/privacy/number_of_websites_with_gpc.sql index bf903a0ea42..667b7bf9f6d 100644 --- a/sql/2025/privacy/number_of_websites_with_gpc.sql +++ b/sql/2025/privacy/number_of_websites_with_gpc.sql @@ -1,4 +1,4 @@ -# Pages that provide `/.well-known/gpc.json` for Global Privacy Control +-- Pages that provide `/.well-known/gpc.json` for Global Privacy Control WITH pages AS ( SELECT diff --git a/sql/2025/privacy/number_of_websites_with_iab.sql b/sql/2025/privacy/number_of_websites_with_iab.sql index 4865ef9cf4a..667051ca60b 100644 --- 
a/sql/2025/privacy/number_of_websites_with_iab.sql +++ b/sql/2025/privacy/number_of_websites_with_iab.sql @@ -1,5 +1,5 @@ -# Counts of pages with IAB Frameworks -# TODO: check presence of multiple frameworks per page +-- Counts of pages with IAB Frameworks +-- TODO: check presence of multiple frameworks per page WITH privacy_custom_metrics_data AS ( SELECT diff --git a/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql b/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql index 89e9eba8ccd..679a8576a2b 100644 --- a/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql +++ b/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql @@ -1,4 +1,5 @@ -# Pages that participate in the privacy-relayed origin trials +-- Pages that participate in the privacy-relayed origin trials + CREATE TEMP FUNCTION `DECODE_ORIGIN_TRIAL`(token STRING) RETURNS STRING DETERMINISTIC AS ( SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) ); diff --git a/sql/2025/privacy/privacy-sandbox-adoption-by-third-parties-by-publishers.sql b/sql/2025/privacy/privacy-sandbox-adoption-by-third-parties-by-publishers.sql index a971f2e560d..df1c63cb830 100644 --- a/sql/2025/privacy/privacy-sandbox-adoption-by-third-parties-by-publishers.sql +++ b/sql/2025/privacy/privacy-sandbox-adoption-by-third-parties-by-publishers.sql @@ -1,5 +1,4 @@ -#standardSQL -# Adoption of different Privacy Sandbox (PS) features by different third-parties and by different publishers +-- Adoption of different Privacy Sandbox (PS) features by different third-parties and by different publishers -- Extracting PS APIs being called by a given third-party (passed as "key") CREATE TEMP FUNCTION JSON_OBJECT_VALUES(jsonObject JSON, key STRING) diff --git a/sql/2025/privacy/top_ara_destinations_registered_by_most_publishers.sql b/sql/2025/privacy/top_ara_destinations_registered_by_most_publishers.sql index fe47074c3c2..2ce936f278c 100644 --- 
a/sql/2025/privacy/top_ara_destinations_registered_by_most_publishers.sql +++ b/sql/2025/privacy/top_ara_destinations_registered_by_most_publishers.sql @@ -1,5 +1,4 @@ -#standardSQL -# Top 25 Attribution Reporting API Destinations (i.e., advertisers) registered by the most number of distinct publishers (at site level) +-- Top 25 Attribution Reporting API Destinations (i.e., advertisers) registered by the most number of distinct publishers (at site level) -- Extracting ARA API source registration details being passed by a given third-party (passed as "key") CREATE TEMP FUNCTION JSON_OBJECT_VALUES(jsonObject JSON, key STRING) diff --git a/sql/2025/privacy/top_ara_destinations_registered_by_most_third_parties.sql b/sql/2025/privacy/top_ara_destinations_registered_by_most_third_parties.sql index 89da104a5c9..5150224a2b5 100644 --- a/sql/2025/privacy/top_ara_destinations_registered_by_most_third_parties.sql +++ b/sql/2025/privacy/top_ara_destinations_registered_by_most_third_parties.sql @@ -1,5 +1,4 @@ -#standardSQL -# Top 25 Attribution Reporting API Destinations (i.e., advertisers) registered by the most number of distinct third-parties (at site level) +-- Top 25 Attribution Reporting API Destinations (i.e., advertisers) registered by the most number of distinct third-parties (at site level) -- Extracting ARA API source registration details being passed by a given third-party (passed as "key") CREATE TEMP FUNCTION JSON_OBJECT_VALUES(jsonObject JSON, key STRING) From cd572c8337e2d9367495058113e02e4df0638d02 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 3 Aug 2025 00:34:47 +0200 Subject: [PATCH 06/13] lint --- sql/2025/privacy/most_common_cname_domains.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/2025/privacy/most_common_cname_domains.sql b/sql/2025/privacy/most_common_cname_domains.sql index 6f0cb9ead0b..2941e119e34 100644 --- a/sql/2025/privacy/most_common_cname_domains.sql +++ 
b/sql/2025/privacy/most_common_cname_domains.sql @@ -66,7 +66,7 @@ cname_stats AS ( adguard_trackers.domain IS NOT NULL AS adguard_known_cname, whotracksme.category AS whotracksme_category, COUNT(DISTINCT page) AS number_of_pages, - ANY_VALUE(page_examples) + ANY_VALUE(page_examples) AS page_examples FROM cnames LEFT JOIN adguard_trackers ON ENDS_WITH(cnames.cname, adguard_trackers.domain) From bb5959a7c2ac1cefe8c529c97848b5d9b0290058 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sat, 13 Sep 2025 00:10:09 +0200 Subject: [PATCH 07/13] Refactor origin trial functions for improved readability and structure --- ...of_websites_with_related_origin_trials.sql | 31 ++++++++++--------- ...of_websites_with_related_origin_trials.sql | 31 ++++++++++--------- sql/util/functions.sql | 24 -------------- 3 files changed, 34 insertions(+), 52 deletions(-) diff --git a/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql b/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql index e55b352eadf..b8f84911ad2 100644 --- a/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql +++ b/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql @@ -1,24 +1,27 @@ # Pages that participate in the privacy-relayed origin trials -CREATE TEMP FUNCTION `DECODE_ORIGIN_TRIAL`(token STRING) RETURNS STRING DETERMINISTIC AS ( - SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) -); - -CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) -RETURNS STRUCT< +CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) RETURNS STRUCT +< token STRING, origin STRING, feature STRING, expiry TIMESTAMP, is_subdomain BOOL, is_third_party BOOL -> AS ( - STRUCT( - DECODE_ORIGIN_TRIAL(token) AS token, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.origin') AS origin, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.feature') AS feature, - TIMESTAMP_SECONDS(CAST(JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.expiry') 
AS INT64)) AS expiry, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isSubdomain') = 'true' AS is_subdomain, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isThirdParty') = 'true' AS is_third_party +> +DETERMINISTIC AS ( + ( + WITH decoded_token AS ( + SELECT SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) AS decoded + ) + SELECT STRUCT( + decoded AS token, + JSON_VALUE(decoded, '$.origin') AS origin, + JSON_VALUE(decoded, '$.feature') AS feature, + TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry, + JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain, + JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party + ) + FROM decoded_token ) ); diff --git a/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql b/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql index 679a8576a2b..bc395d721bb 100644 --- a/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql +++ b/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql @@ -1,25 +1,28 @@ -- Pages that participate in the privacy-relayed origin trials -CREATE TEMP FUNCTION `DECODE_ORIGIN_TRIAL`(token STRING) RETURNS STRING DETERMINISTIC AS ( - SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) -); - -CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) -RETURNS STRUCT< +CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) RETURNS STRUCT +< token STRING, origin STRING, feature STRING, expiry TIMESTAMP, is_subdomain BOOL, is_third_party BOOL -> AS ( - STRUCT( - DECODE_ORIGIN_TRIAL(token) AS token, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.origin') AS origin, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.feature') AS feature, - TIMESTAMP_SECONDS(CAST(JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.expiry') AS INT64)) AS expiry, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isSubdomain') = 'true' AS is_subdomain, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isThirdParty') = 'true' AS is_third_party +> 
+DETERMINISTIC AS ( + ( + WITH decoded_token AS ( + SELECT SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) AS decoded + ) + SELECT STRUCT( + decoded AS token, + JSON_VALUE(decoded, '$.origin') AS origin, + JSON_VALUE(decoded, '$.feature') AS feature, + TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry, + JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain, + JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party + ) + FROM decoded_token ) ); diff --git a/sql/util/functions.sql b/sql/util/functions.sql index becc0ee67f8..b9f861b3683 100644 --- a/sql/util/functions.sql +++ b/sql/util/functions.sql @@ -8,27 +8,3 @@ try { return null; } """; - -# Origin Trials -CREATE OR REPLACE FUNCTION `httparchive.fn.DECODE_ORIGIN_TRIAL`(token STRING) RETURNS STRING DETERMINISTIC AS ( - SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) -); - -CREATE OR REPLACE FUNCTION `httparchive.fn.PARSE_ORIGIN_TRIAL`(token STRING) -RETURNS STRUCT< - token STRING, - origin STRING, - feature STRING, - expiry TIMESTAMP, - is_subdomain BOOL, - is_third_party BOOL -> AS ( - STRUCT( - DECODE_ORIGIN_TRIAL(token) AS token, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.origin') AS origin, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.feature') AS feature, - TIMESTAMP_SECONDS(CAST(JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.expiry') AS INT64)) AS expiry, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isSubdomain') = 'true' AS is_subdomain, - JSON_VALUE(DECODE_ORIGIN_TRIAL(token), '$.isThirdParty') = 'true' AS is_third_party - ) -); From 55cec4f91f4af0ae0c1d28eb5b3789ebbc9a9ff5 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sat, 13 Sep 2025 00:43:47 +0200 Subject: [PATCH 08/13] lint --- ...of_websites_with_related_origin_trials.sql | 19 +++++++++-------- ...of_websites_with_related_origin_trials.sql | 21 ++++++++++--------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git 
a/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql b/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql index b8f84911ad2..b9f7e0878e2 100644 --- a/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql +++ b/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql @@ -1,6 +1,5 @@ # Pages that participate in the privacy-relayed origin trials -CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) RETURNS STRUCT -< +CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) RETURNS STRUCT< token STRING, origin STRING, feature STRING, @@ -13,13 +12,15 @@ DETERMINISTIC AS ( WITH decoded_token AS ( SELECT SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) AS decoded ) - SELECT STRUCT( - decoded AS token, - JSON_VALUE(decoded, '$.origin') AS origin, - JSON_VALUE(decoded, '$.feature') AS feature, - TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry, - JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain, - JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party + + SELECT + STRUCT( + decoded AS token, + JSON_VALUE(decoded, '$.origin') AS origin, + JSON_VALUE(decoded, '$.feature') AS feature, + TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry, + JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain, + JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party ) FROM decoded_token ) diff --git a/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql b/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql index bc395d721bb..a667110aab9 100644 --- a/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql +++ b/sql/2025/privacy/number_of_websites_with_related_origin_trials.sql @@ -1,7 +1,6 @@ -- Pages that participate in the privacy-relayed origin trials -CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) RETURNS STRUCT -< +CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING) 
RETURNS STRUCT< token STRING, origin STRING, feature STRING, @@ -14,14 +13,16 @@ DETERMINISTIC AS ( WITH decoded_token AS ( SELECT SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) AS decoded ) - SELECT STRUCT( - decoded AS token, - JSON_VALUE(decoded, '$.origin') AS origin, - JSON_VALUE(decoded, '$.feature') AS feature, - TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry, - JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain, - JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party - ) + + SELECT + STRUCT( + decoded AS token, + JSON_VALUE(decoded, '$.origin') AS origin, + JSON_VALUE(decoded, '$.feature') AS feature, + TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry, + JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain, + JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party + ) FROM decoded_token ) ); From c3a2ee75e05f6b5c49790c71bd5726beda073be7 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sat, 13 Sep 2025 01:00:39 +0200 Subject: [PATCH 09/13] lint --- .../privacy/number_of_websites_with_related_origin_trials.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql b/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql index b9f7e0878e2..7a57ed673bd 100644 --- a/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql +++ b/sql/2024/privacy/number_of_websites_with_related_origin_trials.sql @@ -21,7 +21,7 @@ DETERMINISTIC AS ( TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry, JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain, JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party - ) + ) FROM decoded_token ) ); From bd7506d2043b22c321df19d082ea199f363359ee Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> 
Date: Sun, 19 Oct 2025 20:33:35 +0200 Subject: [PATCH 10/13] make bq_to_sheets.ipynb runnable and add deps to requirements --- sql/util/bq_to_sheets.ipynb | 207 ++++++++++++++++++------------------ src/requirements.txt | 4 + 2 files changed, 105 insertions(+), 106 deletions(-) diff --git a/sql/util/bq_to_sheets.ipynb b/sql/util/bq_to_sheets.ipynb index 6ba694515c4..c504f20d8dd 100644 --- a/sql/util/bq_to_sheets.ipynb +++ b/sql/util/bq_to_sheets.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "cellView": "form", "id": "U37785Bxt5tE" @@ -37,12 +37,13 @@ "outputs": [], "source": [ "# @title Download repo (skip when running locally)\n", - "# !git clone https://github.com/HTTPArchive/almanac.httparchive.org.git" + "!git clone https://github.com/HTTPArchive/almanac.httparchive.org.git\n", + "!cd almanac.httparchive.org/" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 2, "metadata": { "cellView": "form", "colab": { @@ -56,7 +57,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Branch: privacy-sql-2025\n" + "Branch: privacy-sql-2025\n", + "M\tsql/util/bq_to_sheets.ipynb\n", + "M\tsrc/requirements.txt\n", + "Already on 'privacy-sql-2025'\n", + "Your branch is up to date with 'origin/privacy-sql-2025'.\n", + "Already up to date.\n" ] } ], @@ -64,7 +70,7 @@ "# @title Update chapter branch (skip when running locally)\n", "branch_name = f'{chapter_name.lower()}-sql-{almanac_year}'\n", "print(f\"Branch: {branch_name}\")\n", - "# !cd almanac.httparchive.org/ && git checkout $branch_name && git pull" + "!git checkout $branch_name && git pull" ] }, { @@ -74,34 +80,13 @@ "outputs": [], "source": [ "# Run to authenticate if in Colab (skip when running locally)\n", - "# from google.colab import auth\n", - "# auth.authenticate_user()" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - 
"text": [ - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" - ] - } - ], - "source": [ - "# Run to authenticate if not in Colab\n", - "# Prepare the environments as described in src/README.md\n", - "!pip install gspread gspread_dataframe tabulate -q" + "from google.colab import auth\n", + "auth.authenticate_user()" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 3, "metadata": { "cellView": "form", "id": "45dBifFPJAtO" @@ -111,14 +96,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Spreadsheet authentication failed: \n", - "Note: Make sure you have access to the spreadsheet and proper Google credentials\n" + "Successfully connected to spreadsheet with 1 existing sheets\n" ] } ], "source": [ "# @title Authenticate\n", "import google.auth\n", + "from google.auth.transport.requests import Request\n", + "from google.oauth2.credentials import Credentials\n", "import os\n", "from google.cloud import bigquery\n", "\n", @@ -127,24 +113,33 @@ "\n", "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT\n", "\n", - "credentials, project = google.auth.default()\n", - "client = bigquery.Client()\n", + "# !gcloud auth application-default login --scopes=https://www.googleapis.com/auth/spreadsheets,https://www.googleapis.com/auth/drive,https://www.googleapis.com/auth/bigquery,https://www.googleapis.com/auth/cloud-platform\n", + "\n", + "# Define the scopes needed for both BigQuery and Google Sheets\n", + "SCOPES = [\n", + " 'https://www.googleapis.com/auth/spreadsheets',\n", + " 'https://www.googleapis.com/auth/drive',\n", + " 'https://www.googleapis.com/auth/bigquery'\n", + "]\n", + "\n", + "# Get credentials 
with proper scopes\n", + "credentials, project = google.auth.default(scopes=SCOPES)\n", + "\n", + "# Refresh credentials if needed\n", + "if hasattr(credentials, 'refresh') and hasattr(credentials, 'expired') and credentials.expired:\n", + " credentials.refresh(Request())\n", + "\n", + "client = bigquery.Client(credentials=credentials)\n", "gc = gspread.authorize(credentials)\n", "\n", - "try:\n", - " ss = gc.open_by_url(spreadsheet_url)\n", - " existing_sheets = [s.title for s in ss.worksheets()]\n", - " print(f\"Successfully connected to spreadsheet with {len(existing_sheets)} existing sheets\")\n", - "except Exception as e:\n", - " print(f'Spreadsheet authentication failed: {e}')\n", - " print(\"Note: Make sure you have access to the spreadsheet and proper Google credentials\")\n", - " ss = None\n", - " existing_sheets = []" + "ss = gc.open_by_url(spreadsheet_url)\n", + "existing_sheets = [s.title for s in ss.worksheets()]\n", + "print(f\"Successfully connected to spreadsheet with {len(existing_sheets)} existing sheets\")" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 4, "metadata": { "cellView": "form", "colab": { @@ -159,69 +154,68 @@ "name": "stdout", "output_type": "stream", "text": [ - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| Query name | TB processed/billed | Sheet name | Upload skipped reason |\n", - "+===========================================================================+=======================+=======================================================================+=========================+\n", - "| cookies_top_first_party_names.sql | 0.081 | Cookies Top First Party Names | Dry run |\n", - 
"+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| cookies_top_third_party_domains.sql | 0.083 | Cookies Top Third Party Domains | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| cookies_top_third_party_names.sql | 0.081 | Cookies Top Third Party Names | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_bounce_domains.sql | 6.166 | Most Common Bounce Domains | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_client_hints.sql | 5.217 | Most Common Client Hints | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_cmps_for_iab_tcf_v2.sql | 0.021 | Most Common Cmps For Iab Tcf V2 | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_cname_domains.sql | 0.021 | Most Common Cname Domains | Dry run |\n", - 
"+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_countries_for_iab_tcf_v2.sql | 0.02 | Most Common Countries For Iab Tcf V2 | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_referrer_policy.sql | 3.66 | Most Common Referrer Policy | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_strings_for_iab_usp.sql | 0.021 | Most Common Strings For Iab Usp | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_tracker_categories.sql | 0.973 | Most Common Tracker Categories | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_ara_destinations_registered_by_third_parties_and_publishers.sql | 1.573 | Number Of Ara Destinations Registered By Third Parties And Publishers | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_privacy_sandbox_attested_domains.sql | 1.571 | Number Of Privacy Sandbox Attested Domains | Dry run |\n", - 
"+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_per_technology.sql | 0.025 | Number Of Websites Per Technology | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_per_technology_category.sql | 0.016 | Number Of Websites Per Technology Category | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_using_each_fingerprinting.sql | 0.025 | Number Of Websites Using Each Fingerprinting | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_client_hints.sql | 2.895 | Number Of Websites With Client Hints | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_dnt.sql | 0.021 | Number Of Websites With Dnt | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_gpc.sql | 5.235 | Number Of Websites With Gpc | Dry run |\n", - 
"+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_iab.sql | 0.019 | Number Of Websites With Iab | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_nb_trackers.sql | 0.973 | Number Of Websites With Nb Trackers | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_referrerpolicy.sql | 3.664 | Number Of Websites With Referrerpolicy | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_related_origin_trials.sql | 5.217 | Number Of Websites With Related Origin Trials | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_whotracksme_trackers.sql | 0.978 | Number Of Websites With Whotracksme Trackers | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| privacy-sandbox-adoption-by-third-parties-by-publishers.sql | 1.573 | Privacy Sandbox Adoption By Third Parties By Publishers | Dry run |\n", - 
"+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| top_ara_destinations_registered_by_most_publishers.sql | 1.573 | Top Ara Destinations Registered By Most Publishers | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| top_ara_destinations_registered_by_most_third_parties.sql | 1.573 | Top Ara Destinations Registered By Most Third Parties | Dry run |\n", - "+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+\n" + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| Query name | TB processed - estimate | Sheet name | Upload skipped reason |\n", + "+===========================================================================+===========================+=======================================================================+=========================+\n", + "| cookies_top_first_party_names.sql | 0 | Cookies Top First Party Names | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| cookies_top_third_party_domains.sql | 0 | Cookies Top Third Party Domains | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + 
"| cookies_top_third_party_names.sql | 0 | Cookies Top Third Party Names | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_bounce_domains.sql | 1.716 | Most Common Bounce Domains | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_client_hints.sql | 1.337 | Most Common Client Hints | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_cmps_for_iab_tcf_v2.sql | 0.011 | Most Common Cmps For Iab Tcf V2 | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_cname_domains.sql | 0.021 | Most Common Cname Domains | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_countries_for_iab_tcf_v2.sql | 0.02 | Most Common Countries For Iab Tcf V2 | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_referrer_policy.sql | 1.012 | Most Common Referrer Policy | |\n", + 
"+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_strings_for_iab_usp.sql | 0.011 | Most Common Strings For Iab Usp | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| most_common_tracker_categories.sql | 0.973 | Most Common Tracker Categories | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_ara_destinations_registered_by_third_parties_and_publishers.sql | 0.855 | Number Of Ara Destinations Registered By Third Parties And Publishers | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_privacy_sandbox_attested_domains.sql | 0.854 | Number Of Privacy Sandbox Attested Domains | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_per_technology.sql | 0.013 | Number Of Websites Per Technology | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_per_technology_category.sql | 0.008 | Number Of Websites Per Technology Category | |\n", + 
"+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_using_each_fingerprinting.sql | 0.025 | Number Of Websites Using Each Fingerprinting | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_client_hints.sql | 1.863 | Number Of Websites With Client Hints | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_dnt.sql | 0.011 | Number Of Websites With Dnt | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_gpc.sql | 1.346 | Number Of Websites With Gpc | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_iab.sql | 0.01 | Number Of Websites With Iab | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_nb_trackers.sql | 0.973 | Number Of Websites With Nb Trackers | |\n", + 
"+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_referrerpolicy.sql | 0.493 | Number Of Websites With Referrerpolicy | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_related_origin_trials.sql | 2.193 | Number Of Websites With Related Origin Trials | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| number_of_websites_with_whotracksme_trackers.sql | 0.494 | Number Of Websites With Whotracksme Trackers | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| privacy-sandbox-adoption-by-third-parties-by-publishers.sql | 0.855 | Privacy Sandbox Adoption By Third Parties By Publishers | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| top_ara_destinations_registered_by_most_publishers.sql | 0.855 | Top Ara Destinations Registered By Most Publishers | |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", + "| top_ara_destinations_registered_by_most_third_parties.sql | 0.855 | Top Ara Destinations Registered By Most Third Parties 
| |\n", + "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n" ] } ], "source": [ "# @title Upload query results\n", - "\n", "import glob\n", "import re\n", "from tabulate import tabulate\n", @@ -229,13 +223,14 @@ "import os\n", "\n", "filename_match = '\\\\.sql$' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", - "filename_match_exclude = '^$' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", - "dry_run = True # @param {type: \"boolean\"}\n", + "filename_match_exclude = '' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", + "dry_run = False # @param {type: \"boolean\"}\n", "overwrite_sheets = False # @param {type: \"boolean\"}\n", - "maximum_tb_billed = 0.5 # @param {type: \"raw\", placeholder: \"Insert a number or empty to disable\"}\n", + "maximum_tb_billed = 7 # @param {type: \"raw\", placeholder: \"Insert a number or empty to disable\"}\n", "\n", - "filename_include_regexp = r'{}'.format(filename_match)\n", - "filename_exclude_regexp = r'{}'.format(filename_match_exclude)\n", + "# Handle empty filename_match and filename_match_exclude\n", + "filename_include_regexp = r'.*' if not filename_match or filename_match == '*' else r'{}'.format(filename_match)\n", + "filename_exclude_regexp = r'^$' if not filename_match_exclude else r'{}'.format(filename_match_exclude)\n", "\n", "folder = os.path.join(os.getcwd(), '../', str(almanac_year), chapter_name.lower(), '*.sql')\n", "\n", diff --git a/src/requirements.txt b/src/requirements.txt index 27be9241089..19c398b6671 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -10,3 +10,7 @@ sqlfluff==3.4.2 pandas==2.3.3 google-cloud-bigquery==3.38.0 requests==2.32.5 +db-dtypes==1.4.3 +tabulate==0.9.0 +gspread==6.2.1 +gspread-dataframe==4.0.0 From 
08aa531fe0bdba4e581a6b83144a035de4b016fe Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Oct 2025 20:28:53 +0200 Subject: [PATCH 11/13] Refactor privacy queries and utilities; make bq_to_sheets runnable --- .../most_common_cmps_for_iab_tcf_v2.sql | 5 +- .../privacy/number_of_websites_with_iab.sql | 96 ++++-- sql/util/bq_to_sheets.ipynb | 308 +++++++++--------- sql/util/bq_writer.py | 1 - sql/util/haveibeenpwned.py | 59 ++-- sql/util/whotracksme_trackers.py | 2 +- src/requirements.txt | 1 + 7 files changed, 244 insertions(+), 228 deletions(-) diff --git a/sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql b/sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql index 09dce3f75e8..6e4541a41d2 100644 --- a/sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql +++ b/sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql @@ -6,12 +6,11 @@ WITH cmps AS ( SELECT client, page, - SAFE.STRING(custom_metrics.privacy.iab_tcf_v2.data.cmpId) AS cmpId, + SAFE.INT64(custom_metrics.privacy.iab_tcf_v2.data.cmpId) AS cmpId, COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages FROM `httparchive.crawl.pages` WHERE - date = '2025-07-01' AND - is_root_page = TRUE + date = '2025-07-01' ) SELECT diff --git a/sql/2025/privacy/number_of_websites_with_iab.sql b/sql/2025/privacy/number_of_websites_with_iab.sql index 667051ca60b..017ca4aac6f 100644 --- a/sql/2025/privacy/number_of_websites_with_iab.sql +++ b/sql/2025/privacy/number_of_websites_with_iab.sql @@ -2,36 +2,6 @@ -- TODO: check presence of multiple frameworks per page WITH privacy_custom_metrics_data AS ( - SELECT - client, - custom_metrics.privacy AS metrics - FROM `httparchive.crawl.pages` - WHERE - date = '2025-07-01' AND - is_root_page = TRUE -) - -SELECT - client, - number_of_pages_with_tcfv1 / number_of_pages AS pct_pages_with_tcfv1, - number_of_pages_with_tcfv1, - number_of_pages_with_tcfv2 / number_of_pages AS pct_pages_with_tcfv2, - number_of_pages_with_tcfv2, 
- number_of_pages_with_usp / number_of_pages AS pct_pages_with_usp, - number_of_pages_with_usp, - number_of_pages_with_tcf / number_of_pages AS pct_pages_with_tcf, - number_of_pages_with_tcf, - number_of_pages_with_any / number_of_pages AS pct_pages_with_any, - number_of_pages_with_any, - number_of_pages_with_tcfv1_compliant / number_of_pages AS pct_pages_with_tcfv1_compliant, - number_of_pages_with_tcfv1_compliant, - number_of_pages_with_tcfv2_compliant / number_of_pages AS pct_pages_with_tcfv2_compliant, - number_of_pages_with_tcfv2_compliant, - number_of_pages_with_gpp / number_of_pages AS pct_pages_with_gpp, - number_of_pages_with_gpp, - number_of_pages_with_gpp_data / number_of_pages AS pct_pages_with_gpp_data, - number_of_pages_with_gpp_data -FROM ( SELECT client, COUNT(0) AS number_of_pages, @@ -54,8 +24,70 @@ FROM ( SAFE.BOOL(metrics.iab_tcf_v1.compliant_setup) AS tcfv1_compliant, SAFE.BOOL(metrics.iab_tcf_v2.compliant_setup) AS tcfv2_compliant, metrics.iab_gpp.data IS NOT NULL AS gpp_data - FROM - privacy_custom_metrics_data + FROM ( + SELECT + client, + custom_metrics.privacy AS metrics + FROM `httparchive.crawl.pages` + WHERE + date = '2025-07-01' AND + is_root_page = TRUE + ) ) GROUP BY client ) + +SELECT + client, + metric.metric, + metric.pct_pages, + metric.number_of_pages +FROM ( + SELECT + client, + ARRAY>[STRUCT( + 'tcfv1', + number_of_pages_with_tcfv1 / number_of_pages, + number_of_pages_with_tcfv1 + ), STRUCT( + 'tcfv2', + number_of_pages_with_tcfv2 / number_of_pages, + number_of_pages_with_tcfv2 + ), STRUCT( + 'usp', + number_of_pages_with_usp / number_of_pages, + number_of_pages_with_usp + ), STRUCT( + 'tcf', + number_of_pages_with_tcf / number_of_pages, + number_of_pages_with_tcf + ), STRUCT( + 'any_framework', + number_of_pages_with_any / number_of_pages, + number_of_pages_with_any + ), STRUCT( + 'tcfv1_compliant', + number_of_pages_with_tcfv1_compliant / number_of_pages, + number_of_pages_with_tcfv1_compliant + ), STRUCT( + 
'tcfv2_compliant', + number_of_pages_with_tcfv2_compliant / number_of_pages, + number_of_pages_with_tcfv2_compliant + ), STRUCT( + 'gpp', + number_of_pages_with_gpp / number_of_pages, + number_of_pages_with_gpp + ), STRUCT( + 'gpp_data_available', + number_of_pages_with_gpp_data / number_of_pages, + number_of_pages_with_gpp_data + )] AS metrics + FROM privacy_custom_metrics_data +), + UNNEST(metrics) AS metric +ORDER BY + client; diff --git a/sql/util/bq_to_sheets.ipynb b/sql/util/bq_to_sheets.ipynb index c504f20d8dd..5c1a47985c8 100644 --- a/sql/util/bq_to_sheets.ipynb +++ b/sql/util/bq_to_sheets.ipynb @@ -9,14 +9,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": { "cellView": "form", "id": "U37785Bxt5tE" }, "outputs": [], "source": [ - "# @title Configure the chapter to process\n", + "# @title Configuration\n", "GCP_PROJECT = 'httparchive' #@param {type: \"string\"}\n", "almanac_year = 2025 #@param {type: \"integer\"}\n", "chapter_name = 'privacy' #@param {type: \"string\"}\n", @@ -36,7 +36,7 @@ }, "outputs": [], "source": [ - "# @title Download repo (skip when running locally)\n", + "# @title Download repo (Colab only - skip when running locally)\n", "!git clone https://github.com/HTTPArchive/almanac.httparchive.org.git\n", "!cd almanac.httparchive.org/" ] @@ -67,9 +67,9 @@ } ], "source": [ - "# @title Update chapter branch (skip when running locally)\n", + "# @title Update chapter branch (Colab only - skip when running locally)\n", "branch_name = f'{chapter_name.lower()}-sql-{almanac_year}'\n", - "print(f\"Branch: {branch_name}\")\n", + "print(f\"Switching to branch: {branch_name}\")\n", "!git checkout $branch_name && git pull" ] }, @@ -79,14 +79,14 @@ "metadata": {}, "outputs": [], "source": [ - "# Run to authenticate if in Colab (skip when running locally)\n", + "# @title Authenticate (Colab only - skip when running locally)\n", "from google.colab import auth\n", "auth.authenticate_user()" ] }, { "cell_type": "code", 
- "execution_count": 3, + "execution_count": 12, "metadata": { "cellView": "form", "id": "45dBifFPJAtO" @@ -96,50 +96,40 @@ "name": "stdout", "output_type": "stream", "text": [ - "Successfully connected to spreadsheet with 1 existing sheets\n" + "✓ Connected to spreadsheet with 28 existing sheets\n" ] } ], "source": [ - "# @title Authenticate\n", + "# @title Setup BigQuery and Google Sheets clients\n", "import google.auth\n", - "from google.auth.transport.requests import Request\n", - "from google.oauth2.credentials import Credentials\n", "import os\n", "from google.cloud import bigquery\n", - "\n", "import gspread\n", "from gspread_dataframe import set_with_dataframe\n", "\n", "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT\n", "\n", - "# !gcloud auth application-default login --scopes=https://www.googleapis.com/auth/spreadsheets,https://www.googleapis.com/auth/drive,https://www.googleapis.com/auth/bigquery,https://www.googleapis.com/auth/cloud-platform\n", - "\n", - "# Define the scopes needed for both BigQuery and Google Sheets\n", + "# Authenticate with required scopes for BigQuery and Google Sheets\n", "SCOPES = [\n", " 'https://www.googleapis.com/auth/spreadsheets',\n", " 'https://www.googleapis.com/auth/drive',\n", " 'https://www.googleapis.com/auth/bigquery'\n", "]\n", "\n", - "# Get credentials with proper scopes\n", "credentials, project = google.auth.default(scopes=SCOPES)\n", - "\n", - "# Refresh credentials if needed\n", - "if hasattr(credentials, 'refresh') and hasattr(credentials, 'expired') and credentials.expired:\n", - " credentials.refresh(Request())\n", - "\n", "client = bigquery.Client(credentials=credentials)\n", "gc = gspread.authorize(credentials)\n", "\n", + "# Connect to spreadsheet\n", "ss = gc.open_by_url(spreadsheet_url)\n", "existing_sheets = [s.title for s in ss.worksheets()]\n", - "print(f\"Successfully connected to spreadsheet with {len(existing_sheets)} existing sheets\")" + "print(f\"✓ Connected to spreadsheet with 
{len(existing_sheets)} existing sheets\")" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 38, "metadata": { "cellView": "form", "colab": { @@ -154,169 +144,175 @@ "name": "stdout", "output_type": "stream", "text": [ - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| Query name | TB processed - estimate | Sheet name | Upload skipped reason |\n", - "+===========================================================================+===========================+=======================================================================+=========================+\n", - "| cookies_top_first_party_names.sql | 0 | Cookies Top First Party Names | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| cookies_top_third_party_domains.sql | 0 | Cookies Top Third Party Domains | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| cookies_top_third_party_names.sql | 0 | Cookies Top Third Party Names | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_bounce_domains.sql | 1.716 | Most Common Bounce Domains | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_client_hints.sql | 1.337 | Most Common Client Hints | 
|\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_cmps_for_iab_tcf_v2.sql | 0.011 | Most Common Cmps For Iab Tcf V2 | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_cname_domains.sql | 0.021 | Most Common Cname Domains | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_countries_for_iab_tcf_v2.sql | 0.02 | Most Common Countries For Iab Tcf V2 | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_referrer_policy.sql | 1.012 | Most Common Referrer Policy | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_strings_for_iab_usp.sql | 0.011 | Most Common Strings For Iab Usp | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| most_common_tracker_categories.sql | 0.973 | Most Common Tracker Categories | |\n", - 
"+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_ara_destinations_registered_by_third_parties_and_publishers.sql | 0.855 | Number Of Ara Destinations Registered By Third Parties And Publishers | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_privacy_sandbox_attested_domains.sql | 0.854 | Number Of Privacy Sandbox Attested Domains | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_per_technology.sql | 0.013 | Number Of Websites Per Technology | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_per_technology_category.sql | 0.008 | Number Of Websites Per Technology Category | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_using_each_fingerprinting.sql | 0.025 | Number Of Websites Using Each Fingerprinting | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_client_hints.sql | 1.863 | Number Of Websites With Client Hints | |\n", - 
"+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_dnt.sql | 0.011 | Number Of Websites With Dnt | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_gpc.sql | 1.346 | Number Of Websites With Gpc | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_iab.sql | 0.01 | Number Of Websites With Iab | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_nb_trackers.sql | 0.973 | Number Of Websites With Nb Trackers | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_referrerpolicy.sql | 0.493 | Number Of Websites With Referrerpolicy | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_related_origin_trials.sql | 2.193 | Number Of Websites With Related Origin Trials | |\n", - 
"+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| number_of_websites_with_whotracksme_trackers.sql | 0.494 | Number Of Websites With Whotracksme Trackers | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| privacy-sandbox-adoption-by-third-parties-by-publishers.sql | 0.855 | Privacy Sandbox Adoption By Third Parties By Publishers | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| top_ara_destinations_registered_by_most_publishers.sql | 0.855 | Top Ara Destinations Registered By Most Publishers | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n", - "| top_ara_destinations_registered_by_most_third_parties.sql | 0.855 | Top Ara Destinations Registered By Most Third Parties | |\n", - "+---------------------------------------------------------------------------+---------------------------+-----------------------------------------------------------------------+-------------------------+\n" + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| Query | TB Billed | Sheet | Status/Skip Reason |\n", + "+===========================================================================+=============+=================================+==========================+\n", + "| cookies_top_first_party_names.sql | | | 
Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| cookies_top_third_party_domains.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| cookies_top_third_party_names.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| most_common_bounce_domains.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| most_common_client_hints.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| most_common_cmps_for_iab_tcf_v2.sql | 0 | Most Common Cmps For Iab Tcf V2 | ✓ Uploaded |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| most_common_cname_domains.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| most_common_countries_for_iab_tcf_v2.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| most_common_referrer_policy.sql | | | Filename filter mismatch |\n", + 
"+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| most_common_strings_for_iab_usp.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| most_common_tracker_categories.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| number_of_ara_destinations_registered_by_third_parties_and_publishers.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| number_of_privacy_sandbox_attested_domains.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| number_of_websites_per_technology.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| number_of_websites_per_technology_category.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| number_of_websites_using_each_fingerprinting.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| number_of_websites_with_client_hints.sql | | | Filename filter mismatch |\n", + 
"+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| number_of_websites_with_dnt.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| number_of_websites_with_gpc.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| number_of_websites_with_iab.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| number_of_websites_with_nb_trackers.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| number_of_websites_with_referrerpolicy.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| number_of_websites_with_related_origin_trials.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| number_of_websites_with_whotracksme_trackers.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| privacy-sandbox-adoption-by-third-parties-by-publishers.sql | | | Filename filter mismatch |\n", + 
"+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| top_ara_destinations_registered_by_most_publishers.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "| top_ara_destinations_registered_by_most_third_parties.sql | | | Filename filter mismatch |\n", + "+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+\n", + "\n", + "✓ Processed 27 queries\n" ] } ], "source": [ - "# @title Upload query results\n", + "# @title Upload query results to Google Sheets\n", "import glob\n", "import re\n", "from tabulate import tabulate\n", - "from IPython.display import clear_output\n", - "import os\n", + "from IPython.display import clear_output, display, HTML\n", "\n", - "filename_match = '\\\\.sql$' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", + "# Query filters and options\n", + "filename_match = 'most_common_cmps_for_iab_tcf_v2.sql' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", "filename_match_exclude = '' # @param {type: \"raw\", placeholder: \"Enter regexp wrapped in quotes\"}\n", "dry_run = False # @param {type: \"boolean\"}\n", - "overwrite_sheets = False # @param {type: \"boolean\"}\n", - "maximum_tb_billed = 7 # @param {type: \"raw\", placeholder: \"Insert a number or empty to disable\"}\n", + "overwrite_sheets = True # @param {type: \"boolean\"}\n", + "maximum_tb_billed = 7 # @param {type: \"raw\", placeholder: \"Max TB to bill per query\"}\n", "\n", - "# Handle empty filename_match and filename_match_exclude\n", - "filename_include_regexp = r'.*' if not filename_match or filename_match == '*' else r'{}'.format(filename_match)\n", - 
"filename_exclude_regexp = r'^$' if not filename_match_exclude else r'{}'.format(filename_match_exclude)\n", + "# Setup file filters\n", + "filename_include_regexp = r'.*' if not filename_match or filename_match == '*' else filename_match\n", + "filename_exclude_regexp = r'^$' if not filename_match_exclude else filename_match_exclude\n", "\n", - "folder = os.path.join(os.getcwd(), '../', str(almanac_year), chapter_name.lower(), '*.sql')\n", + "# Build path to SQL files\n", + "sql_folder = os.path.join(os.getcwd(), '../', str(almanac_year), chapter_name.lower(), '*.sql')\n", + "print(f\"Looking for SQL files in: {sql_folder}\")\n", + "sql_files = sorted(glob.glob(sql_folder))\n", "\n", - "print(f\"Looking for SQL files in: {folder}\")\n", + "if not sql_files:\n", + " print(\"❌ No SQL files found. Check the folder path.\")\n", + "else:\n", + " print(f\"Found {len(sql_files)} SQL files\\n\")\n", "\n", - "# Print formatted logs\n", - "queries_processed_log = []\n", - "def print_logs_table(log=None, append=True):\n", - " if log:\n", - " queries_processed_log.append(log)\n", - " table = tabulate(queries_processed_log, headers=['Query name', 'TB processed - estimate', 'Sheet name', 'Upload skipped reason'], tablefmt=\"grid\")\n", - " if not append:\n", - " del queries_processed_log[-1]\n", - " clear_output(wait=True)\n", - " print(table)\n", + " # Progress tracking\n", + " queries_processed_log = []\n", "\n", - "# Find matching SQL queries and save results to Google Sheets.\n", - "sql_files = list(glob.iglob(folder))\n", - "print(f\"Found {len(sql_files)} SQL files\")\n", + " def log_result(filename, tb_processed=None, sheet_name=None, skip_reason=None, preview=False):\n", + " \"\"\"Add result to log and display table\"\"\"\n", + " log_entry = [filename, tb_processed, sheet_name, skip_reason]\n", + " if not preview:\n", + " queries_processed_log.append(log_entry)\n", "\n", - "if not sql_files:\n", - " print(\"No SQL files found. 
Check the folder path.\")\n", - "else:\n", - " for filepath in sorted(sql_files):\n", + " # Build table from current log plus preview entry if needed\n", + " display_log = queries_processed_log if not preview else queries_processed_log + [log_entry]\n", + " table = tabulate(display_log, headers=['Query', 'TB Billed', 'Sheet', 'Status/Skip Reason'], tablefmt=\"grid\")\n", + " clear_output(wait=True)\n", + " print(table)\n", + "\n", + " # Process each SQL file\n", + " for filepath in sql_files:\n", " filename = os.path.basename(filepath)\n", "\n", - " print_logs_table([filename, 'Processing...', 'Processing...', 'Processing...'], append=False)\n", + " # Show processing status\n", + " log_result(filename, 'Processing...', 'Processing...', 'Processing...', preview=True)\n", "\n", - " if re.search(filename_include_regexp, filename) and not re.search(filename_exclude_regexp, filename):\n", + " # Check if filename matches filters\n", + " if not re.search(filename_include_regexp, filename) or re.search(filename_exclude_regexp, filename):\n", + " log_result(filename, None, None, 'Filename filter mismatch')\n", + " continue\n", "\n", - " with open(filepath) as f:\n", - " query = f.read()\n", + " # Read query\n", + " with open(filepath) as f:\n", + " query = f.read()\n", "\n", - " try:\n", - " response = client.query(\n", - " query,\n", - " job_config = bigquery.QueryJobConfig(dry_run = True)\n", - " )\n", - " except Exception as e:\n", - " print_logs_table([filename, None, None, f'Dry run query error:\\n{e}'])\n", - " continue\n", + " # Estimate query cost (dry run)\n", + " try:\n", + " dry_run_response = client.query(query, job_config=bigquery.QueryJobConfig(dry_run=True))\n", + " tb_processed = dry_run_response.total_bytes_processed / 1024**4\n", + " except Exception as e:\n", + " log_result(filename, None, None, f'Dry run error: {str(e)[:100]}...')\n", + " continue\n", "\n", - " tb_processed = response.total_bytes_processed/1024/1024/1024/1024\n", - " sheet_title = 
re.sub(r'(\\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()\n", + " # Generate sheet title from filename\n", + " sheet_title = re.sub(r'(\\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()\n", "\n", - " if sheet_title in existing_sheets:\n", - " if overwrite_sheets:\n", - " st = ss.worksheet(sheet_title)\n", - " else:\n", - " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Sheet already exists'])\n", - " continue\n", + " # Skip execution if dry run mode\n", + " if dry_run:\n", + " log_result(filename, f'{tb_processed:.3f}', sheet_title, 'Dry run mode')\n", + " continue\n", + "\n", + " # Check if sheet already exists\n", + " if sheet_title in existing_sheets and not overwrite_sheets:\n", + " log_result(filename, f'{tb_processed:.3f}', sheet_title, 'Sheet exists (set overwrite_sheets=True)')\n", + " continue\n", "\n", - " if dry_run:\n", - " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Dry run'])\n", - " continue\n", + " # Execute query and upload to Sheets\n", + " try:\n", + " # Run query with billing limit\n", + " job_config = bigquery.QueryJobConfig()\n", + " if maximum_tb_billed:\n", + " job_config.maximum_bytes_billed = int(maximum_tb_billed * 1024**4)\n", "\n", - " # Skip actual execution if no spreadsheet connection\n", - " if ss is None:\n", - " print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'No spreadsheet connection'])\n", - " continue\n", + " query_response = client.query(query, job_config=job_config)\n", + " df = query_response.to_dataframe()\n", "\n", - " try:\n", - " if maximum_tb_billed:\n", - " response = client.query(\n", - " query,\n", - " job_config = bigquery.QueryJobConfig(\n", - " maximum_bytes_billed = maximum_tb_billed*1024*1024*1024*1024\n", - " )\n", - " )\n", - " else:\n", - " response = client.query(query)\n", + " # Get or create sheet\n", + " if sheet_title in existing_sheets:\n", + " sheet = ss.worksheet(sheet_title)\n", + " else:\n", + " sheet = ss.add_worksheet(sheet_title, 
rows=1, cols=1)\n", + " existing_sheets.append(sheet_title)\n", "\n", - " df = response.to_dataframe()\n", - " if ('st' not in locals() or st.title != sheet_title):\n", - " st = ss.add_worksheet(sheet_title, rows = 1, cols = 1)\n", - " set_with_dataframe(st, df, resize=False)\n", + " # Upload data\n", + " set_with_dataframe(sheet, df, resize=False)\n", "\n", - " tb_billed = response.total_bytes_billed/1024/1024/1024/1024\n", - " print_logs_table([filename, f'{tb_billed:.3f}', sheet_title, None])\n", + " tb_billed = query_response.total_bytes_billed / 1024**4\n", + " log_result(filename, f'{tb_billed:.3f}', sheet_title, '✓ Uploaded')\n", "\n", - " except Exception as e:\n", - " print_logs_table([filename, f'{tb_processed:.3f}', None, f'Query error:\\n{e}'])\n", - " continue\n", + " except Exception as e:\n", + " log_result(filename, f'{tb_billed:.3f}', None, f'Query error: {str(e)[:100]}...')\n", "\n", - " else:\n", - " print_logs_table([filename, None, None, 'Filename mismatch'])" + " print(f\"\\n✓ Processed {len(queries_processed_log)} queries\")" ] } ], diff --git a/sql/util/bq_writer.py b/sql/util/bq_writer.py index 3355dcfcb94..a72b6784ee0 100644 --- a/sql/util/bq_writer.py +++ b/sql/util/bq_writer.py @@ -18,7 +18,6 @@ def write_to_bq(df, table_id, schema, write_disposition="WRITE_APPEND"): client = bigquery.Client() job_config = bigquery.LoadJobConfig( - source_format=bigquery.SourceFormat.CSV, write_disposition=write_disposition, schema=schema, ) diff --git a/sql/util/haveibeenpwned.py b/sql/util/haveibeenpwned.py index 1ea1bb7f75f..8ad2fd1cba6 100644 --- a/sql/util/haveibeenpwned.py +++ b/sql/util/haveibeenpwned.py @@ -1,50 +1,33 @@ """ -This module retrieves data from the "haveibeenpwned" API and loads it into a BigQuery table. +Retrieves breach data from the Have I Been Pwned API and loads it into BigQuery. 
+ """ import json -from datetime import datetime as DateTime +from datetime import datetime -import pandas +import pandas as pd import requests # pylint: disable=import-error -from bq_writer import write_to_bq, bigquery - - -# Retrieve data from the "haveibeenpwned" API -breaches = json.loads( - requests.get("https://haveibeenpwned.com/api/v2/breaches", timeout=10).content -) -df = pandas.DataFrame(breaches) - -year = DateTime.now().year -df["date"] = DateTime(year, 6, 1).date() -df["Name"] = df["Name"].astype(str) -df["Title"] = df["Title"].astype(str) -df["Domain"] = df["Domain"].astype(str) -df["BreachDate"] = pandas.to_datetime( - df["BreachDate"], format="%Y-%m-%d", errors="coerce" -).dt.date -df["AddedDate"] = pandas.to_datetime( - df["AddedDate"], format="%Y-%m-%d", errors="coerce" -).dt.date -df["ModifiedDate"] = pandas.to_datetime( - df["ModifiedDate"], format="%Y-%m-%d", errors="coerce" -).dt.date -df["Description"] = df["Description"].astype(str) -df["LogoPath"] = df["LogoPath"].astype(str) -df["DataClasses"] = df["DataClasses"].apply(json.dumps) +from bq_writer import bigquery, write_to_bq -# Append to httparchive.almanac.breaches +# Fetch breach data from API +response = requests.get("https://haveibeenpwned.com/api/v2/breaches", timeout=10) +breaches = response.json() +df = pd.DataFrame(breaches) +# Convert date fields +df["BreachDate"] = pd.to_datetime(df["BreachDate"], errors="coerce") +df["AddedDate"] = pd.to_datetime(df["AddedDate"], errors="coerce") +df["ModifiedDate"] = pd.to_datetime(df["ModifiedDate"], errors="coerce") +# Define BigQuery schema schema = [ - bigquery.SchemaField("date", "DATE"), bigquery.SchemaField("Name", "STRING"), bigquery.SchemaField("Title", "STRING"), bigquery.SchemaField("Domain", "STRING"), bigquery.SchemaField("BreachDate", "DATE"), - bigquery.SchemaField("AddedDate", "DATE"), - bigquery.SchemaField("ModifiedDate", "DATE"), + bigquery.SchemaField("AddedDate", "TIMESTAMP"), + bigquery.SchemaField("ModifiedDate", 
"TIMESTAMP"), bigquery.SchemaField("PwnCount", "INTEGER"), bigquery.SchemaField("Description", "STRING"), bigquery.SchemaField("LogoPath", "STRING"), @@ -53,7 +36,13 @@ bigquery.SchemaField("IsSensitive", "BOOLEAN"), bigquery.SchemaField("IsRetired", "BOOLEAN"), bigquery.SchemaField("IsSpamList", "BOOLEAN"), - bigquery.SchemaField("DataClasses", "STRING"), + bigquery.SchemaField("IsMalware", "BOOLEAN"), + bigquery.SchemaField("IsSubscriptionFree", "BOOLEAN"), + bigquery.SchemaField("IsStealerLog", "BOOLEAN"), + bigquery.SchemaField("DataClasses", "STRING", mode="REPEATED"), + bigquery.SchemaField("Attribution", "STRING"), + bigquery.SchemaField("DisclosureUrl", "STRING"), ] -write_to_bq(df, "httparchive.almanac.breaches", schema) +# Write to BigQuery +write_to_bq(df, "httparchive.almanac.breaches", schema, write_disposition="WRITE_TRUNCATE") diff --git a/sql/util/whotracksme_trackers.py b/sql/util/whotracksme_trackers.py index ec68f922e17..ad78ecf38ea 100644 --- a/sql/util/whotracksme_trackers.py +++ b/sql/util/whotracksme_trackers.py @@ -18,7 +18,7 @@ TRACKERS_QUERY = """ SELECT - '2024-06-01' AS date, + '2025-07-01' AS date, categories.name as category, tracker, domain diff --git a/src/requirements.txt b/src/requirements.txt index 19c398b6671..18b2eb93de8 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -8,6 +8,7 @@ sqlfluff==3.4.2 # sql/util/* dependencies pandas==2.3.3 +pandas-gbq==0.29.2 google-cloud-bigquery==3.38.0 requests==2.32.5 db-dtypes==1.4.3 From c2566e672516801efb80aad319bc18fa3fbc55ff Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Oct 2025 20:52:40 +0200 Subject: [PATCH 12/13] Potential fix for code scanning alert no. 
640: Unused import Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- sql/util/haveibeenpwned.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/util/haveibeenpwned.py b/sql/util/haveibeenpwned.py index 8ad2fd1cba6..38d27b86f36 100644 --- a/sql/util/haveibeenpwned.py +++ b/sql/util/haveibeenpwned.py @@ -4,7 +4,6 @@ """ import json -from datetime import datetime import pandas as pd import requests # pylint: disable=import-error From 42da6adb51c26897ba17d07f304cbf81d57a8062 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 20 Oct 2025 20:54:36 +0200 Subject: [PATCH 13/13] Remove unused json import --- sql/util/haveibeenpwned.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sql/util/haveibeenpwned.py b/sql/util/haveibeenpwned.py index 38d27b86f36..269adf2dd6e 100644 --- a/sql/util/haveibeenpwned.py +++ b/sql/util/haveibeenpwned.py @@ -3,8 +3,6 @@ """ -import json - import pandas as pd import requests # pylint: disable=import-error from bq_writer import bigquery, write_to_bq