Data Platform/Systems/EventLogging

unencoded

https://en.wikipedia.org/beacon/event?%7B%22event%22%3A%7B%22version%22%3A1%2C%22action%22%3A%22abort%22...

https://en.wikipedia.org/beacon/event?
{
 "event": { "action": "abort", ... },
 "schema": "Edit",
 "revision": 1234,
 "webHost": "en.wikipedia.org",
 "wiki": "enwiki"
}

-- Make the refinery Hive UDF jar available to this session.
ADD JAR hdfs:///wmf/refinery/current/artifacts/refinery-hive.jar;
-- Register the UDF class under a temporary (session-scoped) function name.
CREATE TEMPORARY FUNCTION GetMediawikiTimestamp AS 'org.wikimedia.analytics.refinery.hive.GetMediawikiTimestampUDF';
-- Call the function using exactly the spelling it was registered under.
SELECT GetMediawikiTimestamp('2019-02-20T12:34:56Z') AS timestamp;
OK
timestamp
20190220123456

-- Ten most frequent event.userID values in event.MobileWikiAppEdit
-- for year=2017, month=11, day=20, hour=19, most frequent first.
SELECT event.userID, COUNT(*) AS cnt
FROM event.MobileWikiAppEdit
WHERE year = 2017
  AND month = 11
  AND day = 20
  AND hour = 19
GROUP BY event.userID
ORDER BY cnt DESC
LIMIT 10;

...
event.userid	cnt
NULL           1848
333333           87
222229           59
111113           29
111125           21
466534           17
433542           10
754324            7
121346            7
123452            6

-- 50th and 75th percentile of nav.event.responsestart per
-- (origincountry, description) pair for 2020-01-28, joining the two tables
-- on pageviewtoken. Only rows where nav.event.isoversample is false are used,
-- and only groups with more than 1000 joined rows are returned.
SELECT
    nav.event.origincountry,
    srv.event.description,
    PERCENTILE(nav.event.responsestart, 0.50) AS responsestart_p50,
    PERCENTILE(nav.event.responsestart, 0.75) AS responsestart_p75,
    COUNT(*) AS count
FROM event.navigationtiming AS nav
INNER JOIN event.servertiming AS srv
    ON nav.event.pageviewtoken = srv.event.pageviewtoken
WHERE nav.year = 2020 AND nav.month = 1 AND nav.day = 28
    AND srv.year = 2020 AND srv.month = 1 AND srv.day = 28
    AND nav.event.isoversample = false
GROUP BY
    nav.event.origincountry,
    srv.event.description
HAVING count > 1000;

-- Up to 10 rows from eventerror for 2018-11-01 whose event.schema
-- starts with 'MobileWikiApp'.
SELECT *
FROM eventerror
WHERE event.schema LIKE 'MobileWikiApp%'
  AND year = 2018
  AND month = 11
  AND day = 1
LIMIT 10;

// spark2-shell

// SQL text: number of rows per event.userID in event.MobileWikiAppEdit for
// year=2017, month=11, day=20, hour=19, ordered by that count descending.
// There is no LIMIT in the SQL itself; the row cap is applied below.
val query = """
SELECT
  event.userID,
  count(*) as cnt
FROM
  event.MobileWikiAppEdit
WHERE
  year = 2017 AND month = 11 AND day = 20 AND hour = 19
GROUP BY event.userID
ORDER BY cnt DESC
"""

// Run the statement through the shell's `spark` session.
val result = spark.sql(query)
// Keep the first 10 rows of the ordered result and print them as a table.
result.limit(10).show()

...
+--------+----+
|  userID| cnt|
+--------+----+
|    null|1848|
|  333333|  87|
|  222229|  59|
|  111113|  29|
|  111125|  21|
|  466534|  17|
|  433542|  10|
|  754324|   7|
|  121346|   7|
|  123452|   6|
+--------+----+

# pyspark2

# SQL text: number of rows per event.userID in event.MobileWikiAppEdit for
# year=2017, month=11, day=20, hour=19, ordered by that count descending.
# There is no LIMIT in the SQL itself; the row cap is applied below.
query = """
SELECT
  event.userID,
  count(*) as cnt
FROM
  event.MobileWikiAppEdit
WHERE
  year = 2017 AND month = 11 AND day = 20 AND hour = 19
GROUP BY event.userID
ORDER BY cnt DESC
"""

# Run the statement through the shell's `spark` session.
result = spark.sql(query)
# Keep the first 10 rows of the ordered result and print them as a table.
result.limit(10).show()

...
+--------+----+
|  userID| cnt|
+--------+----+
|    null|1848|
|  333333|  87|
|  222229|  59|
|  111113|  29|
|  111125|  21|
|  466534|  17|
|  433542|  10|
|  754324|   7|
|  121346|   7|
|  123452|   6|
+--------+----+

# spark2R

# SQL text: number of rows per event.userID in event.MobileWikiAppEdit for
# year=2017, month=11, day=20, hour=19, ordered by that count descending.
# There is no LIMIT in the SQL itself; the row cap is applied by head() below.
query <- "
SELECT
  event.userID,
  count(*) as cnt
FROM
  event.MobileWikiAppEdit
WHERE
  year = 2017 AND month = 11 AND day = 20 AND hour = 19
GROUP BY event.userID
ORDER BY cnt DESC
"

# Run the statement and collect() the full result before taking the head.
result <- collect(sql(query))
# Show the first 10 rows of the collected result.
head(result,10)

...
     userID  cnt
1        NA 1848
2    333333   87
3    222229   59
4    111113   29
5    111125   21
6    466534   17
7    433542   10
8    754324    7
9    121346    7
10   123452    6

mobilewebuiclicktracking_10742159_15423246 
Edit_13457736_15423246
MobileWikiAppToCInteraction_10375484_15423246
MediaViewer_10867062_15423246
MobileWikiAppToCInteraction_10375484_15423246
pagecontentsavecomplete_5588433_15423246
PageContentSaveComplete_5588433
PageCreation_7481635
PageCreation_7481635_15423246
PageDeletion_7481655
PageDeletion_7481655_15423246

-- Select the rows whose uuid, cast to a string, equals the given literal.
SELECT *
FROM Some_tbl
WHERE CAST(uuid AS STRING) = 'ed663031e61452018531f45b4b5502cb';

/mnt/hdfs/wmf/data/raw/eventlogging/eventlogging_<schema>/hourly/<year>/<month>/<day>/<hour>
ADD JAR file:///usr/lib/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar;

-- Make sure you don't create tables in the default Hive database.
USE otto;

-- Create a table with a single string field.
-- IF NOT EXISTS keeps the example re-runnable: without it a second run fails
-- because the table is already defined.
CREATE EXTERNAL TABLE IF NOT EXISTS `CentralNoticeBannerHistory` (
  `json_string` string
)
PARTITIONED BY (
  year int,
  month int,
  day int,
  hour int
)
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.SequenceFileInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  '/wmf/data/raw/eventlogging/eventlogging_CentralNoticeBannerHistory';

-- Add a partition that points at one hourly directory of raw data.
-- IF NOT EXISTS makes this a no-op when the partition was already added.
ALTER TABLE CentralNoticeBannerHistory
ADD IF NOT EXISTS PARTITION (year=2015, month=9, day=17, hour=16)
LOCATION '/wmf/data/raw/eventlogging/eventlogging_CentralNoticeBannerHistory/hourly/2015/09/17/16';

-- Parse the single string field as JSON and select a nested key out of it
SELECT get_json_object(json_string, '$.event.l.b') AS banner_name
FROM CentralNoticeBannerHistory
WHERE year=2015;

import json

# Read one hourly sequence file; each element is indexed as a (key, value) pair below.
hourly_pairs = sc.sequenceFile("/wmf/data/raw/eventlogging/eventlogging_CentralNoticeBannerHistory/hourly/2015/09/17/07")
# Decode the second item of every pair from a JSON string.
decoded = hourly_pairs.map(lambda kv: json.loads(kv[1]))
# Key every record by the 'b' field of the first entry in event.l, then count per key.
decoded.map(lambda rec: (rec['event']['l'][0]['b'], 1)).countByKey()
Out[33]: defaultdict(<class 'int'>, {'WMES_General_Assembly': 5})

# Load the JSON string values out of the compressed sequence file.
# Note that this uses * globs to expand to all data in 2016.
data = sc.sequenceFile(
    "/wmf/data/raw/eventlogging/eventlogging_MobileWikiAppFindInPage/hourly/2016/*/*/*"
).map(lambda x: x[1])

# parse the JSON strings into a DataFrame
json_data = sqlCtx.jsonRDD(data) # replace with sqlCtx.read.json(data) for pyspark 2
# Register this DataFrame as a temp table so we can use SparkSQL.
json_data.registerTempTable("MobileWikiAppFindInPage")

# Ten most frequent event.pageID values, most frequent first.
top_k_page_ids = sqlCtx.sql(
"""SELECT event.pageID, count(*) AS cnt
    FROM MobileWikiAppFindInPage
    GROUP BY event.pageID
    ORDER BY cnt DESC
    LIMIT 10"""
)
# print(...) with a single pre-formatted string is valid on both Python 2 and
# Python 3; the bare `print "..."` statement is a syntax error on Python 3.
for r in top_k_page_ids.collect():
    print("%s: %s" % (r.pageID, r.cnt))

// Load the JSON string values out of the compressed sequence file
// and parse them as a DataFrame.

val rawDataPath = "/wmf/data/raw/eventlogging/eventlogging_Edit/hourly/2015/10/21/16"

val edits = spark.read.json(
    spark.createDataset[String](
        spark.sparkContext.sequenceFile[Long, String](rawDataPath).map(_._2)
    )
)
// Register this DataFrame as a temp view so we can use SparkSQL.
// createOrReplaceTempView is the Spark 2 replacement for the deprecated
// registerTempTable, and it can be re-run without failing on an existing name.
edits.createOrReplaceTempView("edits")

// SELECT top 10 edited wikis, using the same `spark` session that read the data.
val top_k_edits = spark.sql(
    """SELECT wiki, count(*) AS cnt
    FROM edits
    GROUP BY wiki
    ORDER BY cnt DESC
    LIMIT 10"""
)
// Print them out. collect() first brings the (at most 10) rows to the driver,
// in query order; foreach(println) directly on the DataFrame runs on the
// executors, so on a cluster the rows would not appear in the shell.
top_k_edits.collect().foreach(println)

# Uses kafkacat CLI to print window ($1)
# seconds of data from $topic ($2)
function kafka_timed_subscribe {
    # Both arguments are quoted so that an empty value or one containing
    # whitespace is passed through as a single word instead of being split.
    timeout "$1" kafkacat -C -b kafka-jumbo1001 -t "$2"
}

# Prints the top K most frequently
# occurring values from stdin.
#   $1: K, the number of lines to keep.
# Output lines have the `uniq -c` form: a count followed by the value.
function top_k {
    sort        |
    uniq -c     |
    sort -nr    |
    # Quoted so a missing or malformed K reaches head as one argument.
    head -n "$1"
}

# Loop forever. Each pass prints a timestamped header, then reads the
# eventlogging_Edit topic for 5 seconds, extracts the "wiki" field with jq,
# and prints the 10 most frequent values followed by a blank line.
while :; do
    date
    echo '------------------------------'
    kafka_timed_subscribe 5 eventlogging_Edit | jq .wiki | top_k 10
    echo ''
done

/srv/log/eventlogging/systemd

eventlogging_EventError

eventlogging_processor-client-side-<some>.log 

Unable to validate: ?{
  "event": {
    "pagename": "Recentchanges",
    "namespace": null,
    "invert": false,
    "associated": false,
    "hideminor": false,
    "hidebots": true,
    "hideanons": false,
    "hideliu": false,
    "hidepatrolled": false,
    "hidemyself": false,
    "hidecategorization": true,
    "tagfilter": null
  },
  "schema": "ChangesListFilters",
  "revision": 15876023,
  "clientValidated": false,
  "wiki": "nowikimedia",
  "webHost": "no.wikimedia.org",
  "userAgent": "Apple-PubSub/65.28"
}; cp1066.eqiad.wmnet 42402900 2016-09-26T07:01:42 -

$wgEventLoggingServiceUri

Date from	Date until	Task	Details
2020-06-18T20:00:00Z	2020-06-19T22:00:00Z	Task T249261	While attempting the first migration of legacy EventLogging streams to EventGate, Otto misconfigured the EventLogging extension's `$wgEventLoggingServiceUri` for non-group0 wikis, effectively causing SearchSatisfaction events to be disabled on all non-group0 wikis.
2019-09-23	2019-09-29	Task T233718	Many events emitted by MediaWiki are missing in Hive refined event database tables, including events from mediawiki_revision_create, mediawiki_page_create, etc. This was caused by a problem when importing data from Kafka via Camus, but at the time was only known to affect mediawiki_api_request and mediawiki_cirrussearch_request. Data for other mediawiki_* tables was not backfilled, and the raw data has since been deleted.
2017-11	2017-11	Task T179625	Canonical EventLogging data (parsed and validated and stored in Kafka) did not match EventCapsule schema. This was fixed, and data was transformed before insertion into MySQL for backwards compatibility. This helped standardize all event data so that it could be refined and made available in Hive.
2017-07-10	2017-07-12	task T170486	Some data was not inserted in MySQL, but was backfilled for all schemas but page-create. During the backfill, bot events were also accidentally backfilled, resulting in extra data during this time.
2017-05-24	onwards	task T67508	Do not accept data from bots on eventlogging unless bot user agent matches "MediaWiki".
2017-03-29	onwards	task T153207	Change userAgent field in event capsule
2019-03-19 (14 to 22 hours)		task T218831	The EventLogging MySQL consumer kept restarting for several hours, during which it was unable to insert any data into the database.
2019-04-01		Task: T219842	Kafka Jumbo outage from 22:00 to midnight. Data was lost during those hours.
2019-09-12		https://phabricator.wikimedia.org/T228557	Third party domain data is not getting refined (so sites like w.upupming.site that run clones of our code do not send us their requests)

Name	Client platforms affected	Analytics intake systems affected	Notes
uBlock Origin	Web (desktop + mobile)	EventLogging, MEP	EasyPrivacy enabled by default
Brave (web browser)	Web (desktop + mobile)	MEP	Blocks requests to `intake-analytics.wikimedia.org` when using standard (default) privacy settings

For users

Schemas

Creating a schema

Send events

Client-side events

Accessing data

Privacy

Access

Hadoop & Hive

Notes on data in Hive

Hive

Errors for schemas

Spark

Spark 2 Scala SQL & Hive:

Spark 2 Python SQL & Hive:

Spark 2 R SQL & Hive:

Hadoop. Archived Data

Hadoop Raw Data

Hive

Spark

Kafka

Publishing data

Verify received events

User agent sanitization

Data retention and purging

Retiring a schema

Operational support

Tier 2 support

Outages

Alarms

Contact

For developers

Codebase

Architecture

Performance

Size limitation

Monitoring

Testing

How do I ...?

Administration. On call

Data Quality Issues

Changes and Known Problems with Dataset

Incidents

Limits of the eventlogging replication script

Ad blockers

See also

Notes