Data Platform/Systems/ua-parser/2019-09-18 Update
Appearance
(Redirected from Analytics/Systems/ua-parser/2019-09-18 Update)
This page documents the data quality checks for the 2019-09-18 ua-parser update.
Comparison-table creation
This should have been done using sampled data as documented in the procedure
-- In hive
USE joal;
ADD JAR hdfs:///user/joal/jars/refinery-hive-0.0.100-SNAPSHOT.jar;
CREATE TEMPORARY FUNCTION ua_parser as 'org.wikimedia.analytics.refinery.hive.UAParserUDF';
CREATE TABLE ua_check_2019_09_09 STORED AS parquet AS
SELECT
user_agent,
user_agent_map AS user_agent_map_original,
ua_parser(user_agent) AS user_agent_map_new,
COUNT(1) AS requests
FROM wmf.webrequest
WHERE year = 2019 and month = 9 and day = 9
GROUP BY
user_agent,
user_agent_map,
ua_parser(user_agent);
Analysis
// In spark2-shell
spark.sql("use joal")
spark.table("ua_check_2019_09_09").cache()
/*********************************
Global analyses
**********************************/
spark.sql("""
SELECT
count(distinct user_agent) as distinct_user_agent,
count(distinct user_agent_map_original) as distinct_user_agent_map_original,
count(distinct user_agent_map_new) as distinct_user_agent_map_new
FROM ua_check_2019_09_09
""").show()
/*
+-------------------+--------------------------------+---------------------------+
|distinct_user_agent|distinct_user_agent_map_original|distinct_user_agent_map_new|
+-------------------+--------------------------------+---------------------------+
| 4695039| 662128| 748685|
+-------------------+--------------------------------+---------------------------+
More new distinct user-agent - probably means better details :)
*/
spark.sql("""
SELECT
-- Need to cast to string as map is not natually sortable for group-by
(CAST(user_agent_map_original AS string) = CAST(user_agent_map_new AS string)) as same_original_new,
sum(requests) as requests
FROM ua_check_2019_09_09
GROUP BY (CAST(user_agent_map_original AS string) = CAST(user_agent_map_new AS string))
""").show()
/*
+-----------------+----------+
|same_original_new| requests|
+-----------------+----------+
| false|2456715498|
| true|8279792838|
+-----------------+----------+
~22% of requests have changed from original to new (this seems like a lot!)
*/
/*********************************
By value-type (map key) analyses
**********************************/
val mapValues = Set("browser_family", "os_major", "wmf_app_version", "browser_major", "os_minor", "os_family", "device_family")
/*********************************
Check differences by value-type
**********************************/
mapValues.foreach( v => {
spark.sql(s"""
SELECT
(user_agent_map_original['$v'] = user_agent_map_new['$v']) as same_old_new_$v,
sum(requests) as requests
FROM ua_check_2019_09_09
GROUP BY (user_agent_map_original['$v'] = user_agent_map_new['$v'])
""").show()})
/*
+----------------------+-----------+
|same_old_new_os_family| requests|
+----------------------+-----------+
| false| 8267542|
| true|10728240794|
+----------------------+-----------+
0.08% of os_family have changed - Neglectable :)
+---------------------+----------+
|same_old_new_os_major| requests|
+---------------------+----------+
| false|1245835820|
| true|9490672516|
+---------------------+----------+
11.60% of os_major have changed - Big!
+---------------------+-----------+
|same_old_new_os_minor| requests|
+---------------------+-----------+
| false| 203917573|
| true|10532590763|
+---------------------+-----------+
1.90% of os_minor have changed - small
+--------------------------+-----------+
|same_old_new_browser_major| requests|
+--------------------------+-----------+
| false| 183745423|
| true|10552762913|
+--------------------------+-----------+
1.71% of browser_major have changed - small
+---------------------------+-----------+
|same_old_new_browser_family| requests|
+---------------------------+-----------+
| false| 220319367|
| true|10516188969|
+---------------------------+-----------+
2.05% of browser_family of changed - small
+--------------------------+----------+
|same_old_new_device_family| requests|
+--------------------------+----------+
| false|1543944946|
| true|9192563390|
+--------------------------+----------+
14.38% of device_family have changed - Big!
+----------------------------+-----------+
|same_old_new_wmf_app_version| requests|
+----------------------------+-----------+
| true|10736508336|
+----------------------------+-----------+
No wmf_app_version change (expected :)
Summary: Most of the 22% difference seems to be coming from os_major and device_family
*/
/*********************************
Check differences without os_major and device_family
**********************************/
spark.sql(s"""
SELECT
(
(user_agent_map_original['browser_family'] = user_agent_map_new['browser_family'])
AND (user_agent_map_original['wmf_app_version'] = user_agent_map_new['wmf_app_version'])
AND (user_agent_map_original['browser_major'] = user_agent_map_new['browser_major'])
AND (user_agent_map_original['os_minor'] = user_agent_map_new['os_minor'])
AND (user_agent_map_original['os_family'] = user_agent_map_new['os_family'])
) AS same_old_new_no_os_major_device_family,
sum(requests) AS requests
FROM ua_check_2019_09_09
GROUP BY (
(user_agent_map_original['browser_family'] = user_agent_map_new['browser_family'])
AND (user_agent_map_original['wmf_app_version'] = user_agent_map_new['wmf_app_version'])
AND (user_agent_map_original['browser_major'] = user_agent_map_new['browser_major'])
AND (user_agent_map_original['os_minor'] = user_agent_map_new['os_minor'])
AND (user_agent_map_original['os_family'] = user_agent_map_new['os_family'])
)
""").show()
/*
+--------------------------------------+-----------+
|same_old_new_no_os_major_device_family| requests|
+--------------------------------------+-----------+
| false| 432414989|
| true|10304093347|
+--------------------------------------+-----------+
Without os_major and device_family, difference is down to 4.03%
*/
/*********************************
Check main different values
**********************************/
mapValues.foreach( v => {
spark.sql(s"""
SELECT
user_agent_map_original['$v'] as ${v}_original,
user_agent_map_new['$v'] as ${v}_new,
sum(requests) as requests
FROM ua_check_2019_09_09
WHERE user_agent_map_original['$v'] != user_agent_map_new['$v']
GROUP BY user_agent_map_original['$v'], user_agent_map_new['$v']
ORDER BY requests DESC
LIMIT 10
""").show(10, false)})
/*
+------------------+-------------+--------+
|os_family_original|os_family_new|requests|
+------------------+-------------+--------+
|Windows 98 |Windows |3145438 |
|Windows CE |Windows |2577240 |
|Other |Linux |1463818 |
|Other |iOS |514909 |
|Windows XP |Windows |254268 |
|Other |Red Hat |179065 |
|Windows NT 4.0 |Windows |43636 |
|Windows 7 |Windows |42063 |
|Linux |Sailfish |25976 |
|Windows 3.1 |Windows |9794 |
+------------------+-------------+--------+
Bug corrections for original OS+version to OS only, plus better classification
+-----------------+------------+---------+
|os_major_original|os_major_new|requests |
+-----------------+------------+---------+
|- |9 |972888728|
|8.1 |8 |199160705|
|- |12 |51663316 |
|- |10 |11292025 |
|- |98 |3145564 |
|- |CE |2577240 |
|- |NT |1153315 |
|8 |9 |792991 |
|- |11 |777683 |
|- |28 |648825 |
+-----------------+------------+---------+
Better classification of new versions and bug correction of major+minor to major only
+-----------------+------------+---------+
|os_minor_original|os_minor_new|requests |
+-----------------+------------+---------+
|- |1 |199615874|
|5 |3 |1184992 |
|- |4 |778230 |
|90 |- |687275 |
|- |0 |616120 |
|- |3 |309954 |
|- |2 |259486 |
|00 |- |258925 |
|80 |- |76216 |
|3 |2 |62519 |
+-----------------+------------+---------+
Bug correction (same as major+minor) and better classification
+----------------------+-----------------+---------+
|browser_major_original|browser_major_new|requests |
+----------------------+-----------------+---------+
|12 |81 |104305897|
|71 |10 |12537625 |
|10 |68 |6211297 |
|- |1 |5701667 |
|9 |42 |4977414 |
|76 |10 |4816835 |
|11 |81 |3770998 |
|12 |80 |3681879 |
|12 |- |3554663 |
|- |2 |2660792 |
+----------------------+-----------------+---------+
Better classification
+--------------------------+---------------------+---------+
|browser_family_original |browser_family_new |requests |
+--------------------------+---------------------+---------+
|Mobile Safari |Google |140953344|
|Chrome Mobile |Chrome Mobile WebView|28167135 |
|Chrome Mobile |MiuiBrowser |16879392 |
|Other |Go-http-client |8360628 |
|Chrome Mobile WebView |Google |6677124 |
|Chrome |Chrome Mobile WebView|4375939 |
|Mobile Safari UI/WKWebView|Facebook |3965980 |
|Chrome |Edge |2703141 |
|Other |PycURL |1782816 |
|Mobile Safari |LINE |1469780 |
+--------------------------+---------------------+---------+
Unexpected change: Mobile Safari -> Google, and better classification
+----------------------+-------------------+--------+
|device_family_original|device_family_new |requests|
+----------------------+-------------------+--------+
|Generic Smartphone |Phone |71528254|
|Generic Smartphone |Samsung SM-G950F |26377970|
|Generic Smartphone |Huawei ANE-LX1 |20188659|
|Generic Smartphone |Samsung SM-G930F |18019655|
|Generic Smartphone |Samsung SM-G960F |16928752|
|Generic Smartphone |Samsung SM-A520F |14145243|
|Generic Smartphone |rv:68.0 |14129479|
|Generic Smartphone |XiaoMi Redmi Note 7|13733049|
|Generic Smartphone |XiaoMi Redmi Note 4|13549923|
|Generic Smartphone |XiaoMi Redmi Note 5|12220607|
+----------------------+-------------------+--------+
Better classification
+------------------------+-------------------+--------+
|wmf_app_version_original|wmf_app_version_new|requests|
+------------------------+-------------------+--------+
+------------------------+-------------------+--------+
Summary: Big impact of new version classification for broser_major, and a lot of better classification to device_family
*/
/*********************************
Checking Mobile Safari/Google browser_family swap
**********************************/
spark.sql(s"""
SELECT
user_agent,
user_agent_map_original['browser_family'] AS original_browser_family,
user_agent_map_new['browser_family'] AS new_browser_family,
sum(requests) AS requests
FROM ua_check_2019_09_09
WHERE user_agent_map_original['browser_family'] != user_agent_map_new['browser_family']
AND user_agent_map_new['browser_family'] = 'Google'
GROUP BY
user_agent,
user_agent_map_original['browser_family'],
user_agent_map_new['browser_family']
ORDER BY requests DESC
LIMIT 10
""").show(10, false)
/*
+---------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------+--------+
|user_agent |original_browser_family|new_browser_family|requests|
+---------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------+--------+
|Mozilla/5.0 (iPhone; CPU iPhone OS 12_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/81.0.264749124 Mobile/15E148 Safari/605.1|Mobile Safari |Google |75292632|
|Mozilla/5.0 (iPhone; CPU iPhone OS 12_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/81.0.264749124 Mobile/15E148 Safari/605.1|Mobile Safari |Google |9884363 |
|Mozilla/5.0 (iPad; CPU OS 12_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/81.0.264749124 Mobile/15E148 Safari/605.1 |Mobile Safari |Google |8961205 |
|Mozilla/5.0 (iPad; CPU OS 9_3_5 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) GSA/42.0.183854831 Mobile/13G36 Safari/601.1 |Mobile Safari |Google |3720236 |
|Mozilla/5.0 (iPhone; CPU iPhone OS 12_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/81.0.264749124 Mobile/15E148 Safari/605.1|Mobile Safari |Google |2605360 |
|Mozilla/5.0 (iPad; CPU OS 10_3_3 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) GSA/68.0.234683655 Mobile/14G60 Safari/602.1 |Mobile Safari |Google |2413807 |
|Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/81.0.264749124 Mobile/15E148 Safari/605.1|Mobile Safari |Google |2021833 |
|Mozilla/5.0 (iPhone; CPU iPhone OS 11_4 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) GSA/81.0.264749124 Mobile/15E148 Safari/604.1|Mobile Safari |Google |1666870 |
|Mozilla/5.0 (iPhone; CPU iPhone OS 12_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/80.0.262003652 Mobile/16G77 Safari/604.1 |Mobile Safari |Google |1474741 |
|Mozilla/5.0 (iPad; CPU OS 12_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/81.0.264749124 Mobile/15E148 Safari/605.1 |Mobile Safari |Google |1445142 |
+---------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+------------------+--------+
The user-agent contains GSA (Google Search App, the IOs google-app) and Safari. New regexes now correctly flag those as Google.
*/