User:Elukey/Analytics/Hadoop

From Wikitech
Jump to navigation Jump to search

Things to remember when creating a cluster from scratch

  • Hadoop
# Create the base HDFS directory tree.
# -p makes each mkdir safe to re-run (no "File exists" error) and creates any
# missing parent directory along the path.
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /user/oozie
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /user/hive
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /wmf
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /wmf/camus
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /wmf/data
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /wmf/refinery
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /wmf/camus/webrequest-00
# Created as the analytics user (presumably so that analytics owns it), using the
# same kerberos-run-command wrapper as every other command on this page.
# NOTE(review): this needs analytics to have write access to /user - confirm;
# otherwise create the directory as hdfs and chown it to analytics.
sudo -u analytics kerberos-run-command analytics hdfs dfs -mkdir -p /user/analytics
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /user/history
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /wmf/data/raw
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /wmf/data/wmf
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /wmf/data/event
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /wmf/data/raw/event
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /wmf/data/raw/webrequest
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /wmf/data/raw/eventlogging
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /wmf/data/raw/webrequests_data_loss
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir -p /wmf/data/raw/webrequests_faulty_hosts

# Hand the directories over to the users/groups that write to them.
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown oozie:oozie /user/oozie
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown hive:hive /user/hive
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/camus
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown yarn:hadoop /user/history
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/data/raw/event
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics-privatedata-users /wmf/data/raw/webrequest
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/data/raw/eventlogging
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/data/event
# NOTE(review): no mkdir above creates the next two paths, so these chowns fail
# unless the directories already exist - confirm the intended paths.
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics-privatedata-users /wmf/data/webrequest
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/data/eventlogging
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/data/raw/webrequests_faulty_hosts
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/data/raw/webrequests_data_loss
# NOTE(review): /wmf/data/raw/webrequests is not created above either (only
# /wmf/data/raw/webrequest, which is already chowned) - confirm the intended path.
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/data/raw/webrequests

# Mode 1777: world-writable with the sticky bit set.
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chmod 1777 /user/history
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chmod 1777 /tmp
  • Hive
# Start a Hive CLI session as hdfs, then enter the statements below at the prompt.
# Each statement must end with a semicolon, otherwise the CLI keeps waiting for input.
sudo -u hdfs kerberos-run-command hdfs hive
  create database wmf_raw;
  create database event;
  create database wmf;

Then create the tables defined by the refinery's Hive scripts:

REMEMBER: before executing the scripts, change the target HDFS from prod to test in every one of them.

# Make the database directories world-writable with the sticky bit set (1777),
# presumably so the analytics user can create tables in them below.
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chmod 1777 /user/hive/warehouse/wmf_raw.db
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chmod 1777 /user/hive/warehouse/wmf.db

# Create the webrequest tables as analytics, one .hql script per table.
sudo -u analytics kerberos-run-command analytics hive -f create_webrequest_raw_table.hql --database wmf_raw
sudo -u analytics kerberos-run-command analytics hive -f create_webrequest_sequence_stats_hourly_table.hql --database wmf_raw
sudo -u analytics kerberos-run-command analytics hive -f create_webrequest_sequence_stats_table.hql --database wmf_raw
# (?) The target database for the subset table was marked as unconfirmed in the
# original notes - verify that wmf is correct before running.
sudo -u analytics kerberos-run-command analytics hive -f create_webrequest_subset_table.hql --database wmf
sudo -u analytics kerberos-run-command analytics hive -f create_webrequest_table.hql --database wmf
  • Refinery

Deploy the refinery to HDFS as usual, no changes or cherry picks

# after git clone refinery on a host, like an-tool1006
cd refinery
git fat init
git fat pull
# Manual step: cherry pick https://gerrit.wikimedia.org/r/#/c/analytics/refinery/+/491791/
# (copy the exact fetch/cherry-pick command for the wanted patchset from Gerrit).
cd oozie
# replace analytics-alerts@ with your email (ltoscano@ below is the address used
# in these notes - substitute your own).
# File names are passed NUL-delimited so names containing spaces survive intact;
# xargs -r skips sed entirely when grep finds no matching file.
grep -rlZ analytics-alerts . | xargs -0 -r sed -i -e 's/analytics-alerts@/ltoscano@/g'
cd ..
# Upload the edited oozie directory to the personal HDFS home used by the job below.
hdfs dfs -copyFromLocal oozie /user/elukey/oozie

# Note:
# - check in bundle_test.properties that the refinery hive jar referenced is on hdfs
# - refinery_directory is set to the last entry listed under /wmf/refinery/2020*;
#   the glob is quoted so that HDFS expands it, not the local shell. The 2020
#   prefix only matches deployments named 2020*, adjust it for other years.
# - the command aborts with an error message if OOZIE_URL is unset or empty.
sudo -u analytics kerberos-run-command analytics oozie job \
-Duser=analytics \
-Dstart_time=2020-04-01T11:00Z \
-Derror_incomplete_data_threshold=100 \
-Dwarning_incomplete_data_threshold=100 \
-Derror_data_loss_threshold=100 \
-Dwarning_data_loss_threshold=100 \
-Dqueue_name=production \
-Doozie_directory=hdfs://analytics-test-hadoop/user/elukey/oozie \
-Drefinery_directory="hdfs://analytics-test-hadoop$(sudo -u analytics kerberos-run-command analytics hdfs dfs -ls -d '/wmf/refinery/2020*' | tail -n 1 | awk '{print $NF}')" \
-oozie "${OOZIE_URL:?OOZIE_URL must be set}" -run -config /home/elukey/refinery/oozie/webrequest/load/bundle_test.properties