- One hour of pageviews (${YEAR},${MONTH},${DAY},${HOUR})
- filtered from
- IPs having more than IP_UA_THRESHOLD different user agents
- (trying to remove mobile "many devices for a single IP" artifact)
- IPs having more than IP_REQ_THRESHOLD views
- (trying to remove u "many devices for a single IP" artifact)
- finger-printing Dimensions we took as defining a group are:
- project (not to be anonymized)
- language_variant (not to be anonymized)
- access method (not to be anonymized)
- agent type, (not to be anonymized)
- zero carrier,
- geocoding hierarchical data (continent, country, subdivision, city)
- user_agent_map
- Data to analyze
- Distinct IPs (count and list)
- Titles of page viewed
- Number of requests
|
CREATE TABLE pv_san.idents_buckets STORED AS PARQUET AS
WITH wrong_ips AS (
SELECT
client_ip AS w_ip,
COUNT(DISTINCT user_agent) AS user_agent_count,
COUNT(1) AS request_count
FROM wmf.webrequest
WHERE webrequest_source IN ('mobile', 'text')
AND year=${YEAR} AND month=${MONTH} AND day=${DAY} AND hour=${HOUR}
AND is_pageview
GROUP BY client_ip
HAVING (COUNT(DISTINCT user_agent) >= ${IP_UA_THRESHOLD}
OR COUNT(1) >= ${IP_REQ_THRESHOLD} )
)
SELECT
pageview_info['project'] as project,
pageview_info['language_variant'] as language,
agent_type,
access_method,
agent_type,
x_analytics_map['zero'] AS zero_carrier,
geocoded_data['continent'] AS continent,
geocoded_data['country'] AS country,
geocoded_data['subdivision'] AS subdivision,
geocoded_data['city'] AS city,
user_agent_map,
COUNT(DISTINCT client_ip) AS ips_count,
COLLECT_SET(client_ip) AS unique_ips,
COUNT(DISTINCT pageview_info['page_title']) AS pages_count,
COLLECT_SET(pageview_info['page_title']) AS unique_pages,
COUNT(1) AS request_count
FROM wmf.webrequest webrequest
LEFT OUTER JOIN wrong_ips ON (webrequest.client_ip = wrong_ips.w_ip)
WHERE wrong_ips.w_ip IS NULL and
AND webrequest_source IN ('mobile', 'text')
AND year=${YEAR} and month=${MONTH} and day=${DAY} and hour=${HOUR}
AND geocoded_data['continent'] not in ('Unknown', '--', '-')
AND geocoded_data['country'] not in ('Unknown', '--', '-')
AND geocoded_data['subdivision'] not in ('Unknown', '--', '-')
AND geocoded_data['city'] not in ('Unknown', '--', '-')
AND is_pageview
AND agent_type = 'user'
GROUP BY
pageview_info['project'],
pageview_info['language_variant'],
access_method,
agent_type,
x_analytics_map['zero'],
geocoded_data['continent'],
geocoded_data['country'],
geocoded_data['subdivision'],
geocoded_data['city'],
user_agent_map;
|