Created
December 15, 2014 16:24
-
-
Save Ironholds/96558613fe38dd4d1961 to your computer and use it in GitHub Desktop.
query
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SET hive.exec.compress.output=true; | |
SET whitelisted_mediawiki_projects = 'commons', 'meta', 'incubator', 'species'; | |
CREATE TABLE ironholds.pageviews_sample_test(qualifier STRING, count_views INT); | |
INSERT OVERWRITE TABLE ironholds.pageviews_sample_test | |
SELECT | |
CONCAT(sub1.language_and_site, sub1.project_suffix) qualifier, | |
COUNT(*) count_views | |
FROM ( | |
SELECT | |
regexp_extract(uri_host, '^([A-Za-z0-9-]+(\\.(zero|m))?)\\.[a-z]*\\.org$') language_and_site, | |
CASE regexp_extract(uri_host, '\\.(wik(ipedia|ibooks|tionary|imediafoundation|imedia|inews|iquote|isource|iversity|ivoyage|idata)|mediawiki)\\.org$') | |
WHEN 'wikipedia' THEN '' | |
WHEN 'wikibooks' THEN '.b' | |
WHEN 'wiktionary' THEN '.d' | |
WHEN 'wikimediafoundation' THEN '.f' | |
WHEN 'wikimedia' THEN '.m' | |
WHEN 'wikinews' THEN '.n' | |
WHEN 'wikiquote' THEN '.q' | |
WHEN 'wikisource' THEN '.s' | |
WHEN 'wikiversity' THEN '.v' | |
WHEN 'wikivoyage' THEN '.voy' | |
WHEN 'mediawiki' THEN '.w' | |
WHEN 'wikidata' THEN '.wd' | |
ELSE NULL END project_suffix | |
FROM wmf_raw.webrequest | |
WHERE webrequest_source IN ('text', 'mobile') | |
AND year=2014 | |
AND month=12 | |
AND day=7 | |
AND hour=12 | |
AND (content_type IN('text/html\; charset=iso-8859-1','text/html\; charset=ISO-8859-1','text/html','text/html\; charset=utf-8','text/html\; charset=UTF-8') AND uri_path NOT LIKE ('api.php')) OR (content_type = 'application/json' AND user_agent LIKE('WikipediaApp') AND uri_path LIKE('sections=0')) | |
AND uri_path NOT RLIKE('(BannerRandom|CentralAutoLogin|ZeroRatedMobileAccess|UserLogin|MobileEditor)') | |
AND uri_path RLIKE('(/zh(-(tw|cn|hant|mo|my|hans|hk|sg))?/|/sr(-(ec|el))?/|/wiki(/|\\?(cur|old)id=)|/w/|/\\?title=)') | |
AND http_status = '200' | |
AND uri_path NOT LIKE('undefined') | |
AND uri_path NOT LIKE('Undefined') | |
) sub1 | |
WHERE sub1.language_and_site != '' | |
AND sub1.project_suffix IS NOT NULL | |
AND ( | |
sub1.project_suffix != '.m' | |
OR SPLIT(sub1.language_and_site, '\\.')[0] IN ( | |
${hiveconf:whitelisted_mediawiki_projects} | |
) | |
) | |
GROUP BY sub1.language_and_site, sub1.project_suffix | |
LIMIT 100000000; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment