Analytics/Data Lake/Traffic/ApiAction

wmf_raw.ApiAction

The wmf_raw.ApiAction table contains 'raw' api.php request data. The data is produced by the ApiMain::logRequest() method in MediaWiki and sent to the "ApiAction" debug logging channel. Messages on this channel are encoded as binary Avro and delivered to Kafka. An Oozie job consumes the Avro data from the "mediawiki_ApiAction" Kafka topic and stores it in HDFS.
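
A minimal example of querying the raw table from Hive (a sketch only: the year/month/day/hour partition columns are assumed to follow the standard wmf_raw partitioning convention and are not documented on this page):

-- Count API requests per wiki for a single hour.
-- NOTE: the year/month/day/hour partition columns are an assumption here.
SELECT
  wiki,
  COUNT(*) AS requests
FROM wmf_raw.ApiAction
WHERE year = 2016 AND month = 1 AND day = 1 AND hour = 0
GROUP BY wiki
ORDER BY requests DESC
LIMIT 20;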

Avro schema

{
  "type": "record",
  "name": "ApiAction",
  "namespace": "org.wikimedia.analytics.schemas",
  "doc": "A single request to the MediaWiki Action API (api.php)",
  "fields": [
    {
      "name": "ts",
      "doc": "The timestamp, in unix time, that the request was made",
      "type": "int",
      "default": 0
    },
    {
      "name": "ip",
      "doc": "The requesting IP address (either ipv4 or ipv6) in string notation",
      "type": "string",
      "default": ""
    },
    {
      "name": "userAgent",
      "doc": "The HTTP User-Agent header prefixed with the Api-user-agent header if present",
      "type": "string",
      "default": ""
    },
    {
      "name": "wiki",
      "doc": "The wiki making this request, such as dewiki or enwiktionary",
      "type": "string",
      "default": ""
    },
    {
      "name": "timeSpentBackend",
      "doc": "Elapsed milliseconds to process request as measured by MediaWiki",
      "type": "int",
      "default": -1
    },
    {
      "name": "hadError",
      "doc": "Were errors encountered when processing this request?",
      "type": "boolean",
      "default": false
    },
    {
      "name": "errorCodes",
      "doc": "List of error codes for any errors encountered",
      "type": {"type": "array", "items": "string"},
      "default": []
    },
    {
      "name": "params",
      "doc": "Request parameters",
      "type": {"type": "map", "values": "string"},
      "default": {}
    }
  ]
}
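
In Hive, the params and errorCodes fields surface as map<string,string> and array<string> columns, so individual parameters can be addressed directly. A sketch, using the same assumed partition filters as above:

-- Break out the action parameter and individual error codes.
-- LATERAL VIEW explode() turns the errorCodes array into one row per code;
-- requests with an empty errorCodes array produce no rows here.
SELECT
  params['action'] AS action,
  errorCode,
  COUNT(*) AS requests
FROM wmf_raw.ApiAction
  LATERAL VIEW explode(errorCodes) ec AS errorCode
WHERE year = 2016 AND month = 1 AND day = 1 AND hour = 0
GROUP BY params['action'], errorCode
ORDER BY requests DESC;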

wmf.action_*

The raw data from wmf_raw.ApiAction is used to compute various dimensional rollup tables for easier analysis.

-- Create tables for Action API stats
--
-- Usage:
--     hive -f create-action-tables.sql --database wmf

CREATE TABLE IF NOT EXISTS action_ua_hourly (
  userAgent STRING COMMENT 'Raw user-agent',
  wiki      STRING COMMENT 'Target wiki (e.g. enwiki)',
  ipClass   STRING COMMENT 'IP based origin, can be wikimedia, wikimedia_labs or internet',
  viewCount BIGINT COMMENT 'Number of requests'
)
COMMENT 'Hourly summary of Action API requests bucketed by user-agent and wiki'
PARTITIONED BY (
  year      INT COMMENT 'Unpadded year of request',
  month     INT COMMENT 'Unpadded month of request',
  day       INT COMMENT 'Unpadded day of request',
  hour      INT COMMENT 'Unpadded hour of request'
)
STORED AS PARQUET;


CREATE EXTERNAL TABLE IF NOT EXISTS action_action_hourly (
  action    STRING COMMENT 'Action parameter value',
  wiki      STRING COMMENT 'Target wiki (e.g. enwiki)',
  ipClass   STRING COMMENT 'IP based origin, can be wikimedia, wikimedia_labs or internet',
  viewCount BIGINT COMMENT 'Number of requests'
)
COMMENT 'Hourly summary of Action API requests bucketed by action and wiki'
PARTITIONED BY (
  year      INT COMMENT 'Unpadded year of request',
  month     INT COMMENT 'Unpadded month of request',
  day       INT COMMENT 'Unpadded day of request',
  hour      INT COMMENT 'Unpadded hour of request'
)
STORED AS PARQUET;


CREATE EXTERNAL TABLE IF NOT EXISTS action_param_hourly (
  action    STRING COMMENT 'Action parameter value',
  param     STRING COMMENT 'Parameter name, can be prop, list, meta, generator, etc',
  value     STRING COMMENT 'Parameter value',
  wiki      STRING COMMENT 'Target wiki (e.g. enwiki)',
  ipClass   STRING COMMENT 'IP based origin, can be wikimedia, wikimedia_labs or internet',
  viewCount BIGINT COMMENT 'Number of requests'
)
COMMENT 'Hourly summary of Action API requests bucketed by action, parameter, value and wiki'
PARTITIONED BY (
  year      INT COMMENT 'Unpadded year of request',
  month     INT COMMENT 'Unpadded month of request',
  day       INT COMMENT 'Unpadded day of request',
  hour      INT COMMENT 'Unpadded hour of request'
)
STORED AS PARQUET;

-- NOTE: there are many params we do not want to count distinct values of
-- at all (eg maxlag, smaxage, maxage, requestid, origin, centralauthtoken,
-- titles, pageids). Rather than trying to make an extensive blacklist and
-- potentially allow new parameters to slip through which have high
-- cardinality or sensitive information, the ETL process will use a whitelist
-- approach to count params that have been deemed to be useful.
--
-- The initial whitelist is (query, prop), (query, list), (query, meta),
-- (flow, module), (*, generator). The prop, list and meta parameters will
-- additionally be split on '|' with each component counted separately.