# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'
# parse hostnames from links
# includes extraction for web.archive.org links
parse_host = """
PARSE_URL(REGEXP_REPLACE(LOWER({}),'^https?://web.archive.org/web/[^/]+/',''),'HOST')
"""
# count of top 20 hostnames in WP:M pages with external links
top_hosts_wpm_query = """
SELECT {} AS host, COUNT(DISTINCT el_from, el_to) AS count
FROM ryanmax.population_externallinks
WHERE el_from IN
(SELECT DISTINCT page_id AS el_from
FROM ryanmax.population_wpm_pages_with_extlinks
)
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_wpm = spark.sql(
top_hosts_wpm_query.format(
parse_host.format('el_to')
))
top_hosts_wpm.toPandas()
host | count | |
---|---|---|
0 | www.ncbi.nlm.nih.gov | 238688 |
1 | doi.org | 183761 |
2 | www.worldcat.org | 25434 |
3 | books.google.com | 24491 |
4 | www.google.com | 11074 |
5 | apps.who.int | 12816 |
6 | www.icd9data.com | 8693 |
7 | www.wikidata.org | 8069 |
8 | amigo.geneontology.org | 6909 |
9 | www.nlm.nih.gov | 6420 |
10 | www.cdc.gov | 6285 |
11 | www.jstor.org | 4302 |
12 | scholar.google.com | 2850 |
13 | www.who.int | 5350 |
14 | www.nytimes.com | 4545 |
15 | www.omim.org | 4415 |
16 | adsabs.harvard.edu | 4255 |
17 | www.emedicine.com | 3477 |
18 | www.diseasesdatabase.com | 3105 |
19 | id.loc.gov | 3008 |
# count of top 20 hostnames in WP:M pages with external links
top_hosts_w_query = """
SELECT {} AS host, COUNT(DISTINCT el_from, el_to) AS count
FROM ryanmax.population_externallinks
WHERE el_from IN
(SELECT DISTINCT page_id AS el_from
FROM ryanmax.population_w_pages_with_extlinks
)
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_w = spark.sql(
top_hosts_w_query.format(
parse_host.format('el_to')
))
top_hosts_w.toPandas()
host | count | |
---|---|---|
0 | tools.wmflabs.org | 2118863 |
1 | www.google.com | 1763467 |
2 | doi.org | 1199818 |
3 | www.ncbi.nlm.nih.gov | 1122007 |
4 | www.worldcat.org | 1061459 |
5 | books.google.com | 1231622 |
6 | scholar.google.com | 437109 |
7 | www.wikidata.org | 805309 |
8 | www.jstor.org | 548764 |
9 | ssd.jpl.nasa.gov | 592901 |
10 | minorplanetcenter.net | 556758 |
11 | viaf.org | 535724 |
12 | id.loc.gov | 450557 |
13 | www.imdb.com | 418623 |
14 | isni.org | 413584 |
15 | www.gbif.org | 351989 |
16 | www.nytimes.com | 343951 |
17 | www.youtube.com | 330816 |
18 | eol.org | 304728 |
19 | www.webcitation.org | 301718 |
# count of web.archive.org links in WP:M pages with external links
archive_org_wpm_query = """
SELECT PARSE_URL(LOWER(el_to),'HOST') AS host, COUNT(DISTINCT el_from, el_to) AS count
FROM ryanmax.population_externallinks
WHERE el_from IN
(SELECT DISTINCT page_id AS el_from
FROM ryanmax.population_wpm_pages_with_extlinks
)
AND LOWER(el_to) RLIKE '^https?://web.archive.org/web/'
AND PARSE_URL(LOWER(el_to),'HOST') = 'web.archive.org'
GROUP BY host
"""
archive_org_wpm = spark.sql(archive_org_wpm_query)
archive_org_wpm.toPandas()
host | count | |
---|---|---|
0 | web.archive.org | 50390 |
# count of web.archive.org links in W pages with external links
archive_org_w_query = """
SELECT PARSE_URL(LOWER(el_to),'HOST') AS host, COUNT(DISTINCT el_from, el_to) AS count
FROM ryanmax.population_externallinks
WHERE el_from IN
(SELECT DISTINCT page_id AS el_from
FROM ryanmax.population_w_pages_with_extlinks
)
AND LOWER(el_to) RLIKE '^https?://web.archive.org/web/'
AND PARSE_URL(LOWER(el_to),'HOST') = 'web.archive.org'
GROUP BY host
"""
archive_org_w = spark.sql(archive_org_w_query)
archive_org_w.toPandas()
host | count | |
---|---|---|
0 | web.archive.org | 4416436 |
# Total event count for top 20 hostnames
# limited to WP:M pages with external links
top_hosts_wpm_query = """
SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN
(SELECT DISTINCT page_id
FROM ryanmax.population_wpm_pages_with_extlinks
)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_wpm_events = spark.sql(
top_hosts_wpm_query.format(
parse_host.format('link_url'),
days_in_study,
event_exclusion_sql, start_date_string, end_date_string
))
top_hosts_wpm_events.toPandas()
host | total_events | daily_average_events | |
---|---|---|---|
0 | en.wikipedia.org | 1860686 | 58146.43750 |
1 | en.m.wikipedia.org | 627309 | 19603.40625 |
2 | www.ncbi.nlm.nih.gov | 86432 | 2701.00000 |
3 | doi.org | 74930 | 2341.56250 |
4 | books.google.com | 19729 | 616.53125 |
5 | www.drugs.com | 16639 | 519.96875 |
6 | www.who.int | 15105 | 472.03125 |
7 | translate.googleusercontent.com | 10775 | 336.71875 |
8 | theranos.com | 10660 | 333.12500 |
9 | www.cdc.gov | 10119 | 316.21875 |
10 | chemapps.stolaf.edu | 7613 | 237.90625 |
11 | apps.who.int | 6384 | 199.50000 |
12 | www.nlm.nih.gov | 6291 | 196.59375 |
13 | www.fda.gov | 5101 | 159.40625 |
14 | www.nytimes.com | 5005 | 156.40625 |
15 | www.commonchemistry.org | 4604 | 143.87500 |
16 | www.accessdata.fda.gov | 4352 | 136.00000 |
17 | www.youtube.com | 4154 | 129.81250 |
18 | emedicine.medscape.com | 3791 | 118.46875 |
19 | pubchem.ncbi.nlm.nih.gov | 3469 | 108.40625 |
# Total event count for top 20 hostnames by event type
# limited to W pages with external links
top_hosts_w_query = """
SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN
(SELECT DISTINCT page_id
FROM ryanmax.population_w_pages_with_extlinks
)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_w_events = spark.sql(
top_hosts_w_query.format(
parse_host.format('link_url'),
days_in_study,
event_exclusion_sql, start_date_string, end_date_string
))
top_hosts_w_events.toPandas()
host | total_events | daily_average_events | |
---|---|---|---|
0 | en.wikipedia.org | 42334009 | 1.322938e+06 |
1 | en.m.wikipedia.org | 17636814 | 5.511504e+05 |
2 | www.imdb.com | 3050694 | 9.533419e+04 |
3 | tools.wmflabs.org | 1064153 | 3.325478e+04 |
4 | www.youtube.com | 601433 | 1.879478e+04 |
5 | books.google.com | 573246 | 1.791394e+04 |
6 | www.espncricinfo.com | 379919 | 1.187247e+04 |
7 | doi.org | 350714 | 1.095981e+04 |
8 | www.nytimes.com | 301568 | 9.424000e+03 |
9 | www.bbc.co.uk | 227540 | 7.110625e+03 |
10 | www.theguardian.com | 225815 | 7.056719e+03 |
11 | translate.googleusercontent.com | 184858 | 5.776812e+03 |
12 | www.instagram.com | 182944 | 5.717000e+03 |
13 | twitter.com | 180994 | 5.656062e+03 |
14 | www.google.com | 166936 | 5.216750e+03 |
15 | archive.org | 162174 | 5.067938e+03 |
16 | www.ncbi.nlm.nih.gov | 157171 | 4.911594e+03 |
17 | www.allmusic.com | 155615 | 4.862969e+03 |
18 | www.rottentomatoes.com | 140717 | 4.397406e+03 |
19 | www.biblegateway.com | 136031 | 4.250969e+03 |
# Total event count for top 20 hostnames
# limited to WP:M pages with external links
# further limited to extClick events
top_hosts_wpm_query = """
SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events
FROM citationusage
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND page_id IN
(SELECT DISTINCT page_id
FROM ryanmax.population_wpm_pages_with_extlinks
)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_wpm_events = spark.sql(
top_hosts_wpm_query.format(
parse_host.format('link_url'),
days_in_study,
event_exclusion_sql, start_date_string, end_date_string
))
top_hosts_wpm_events.toPandas()
host | total_events | daily_average_events | |
---|---|---|---|
0 | www.ncbi.nlm.nih.gov | 86432 | 2701.00000 |
1 | doi.org | 74930 | 2341.56250 |
2 | books.google.com | 19725 | 616.40625 |
3 | www.drugs.com | 16639 | 519.96875 |
4 | www.who.int | 15105 | 472.03125 |
5 | theranos.com | 10660 | 333.12500 |
6 | www.cdc.gov | 10119 | 316.21875 |
7 | chemapps.stolaf.edu | 7613 | 237.90625 |
8 | apps.who.int | 6384 | 199.50000 |
9 | www.nlm.nih.gov | 6291 | 196.59375 |
10 | www.fda.gov | 5101 | 159.40625 |
11 | www.nytimes.com | 5005 | 156.40625 |
12 | www.commonchemistry.org | 4604 | 143.87500 |
13 | www.accessdata.fda.gov | 4352 | 136.00000 |
14 | www.youtube.com | 4154 | 129.81250 |
15 | emedicine.medscape.com | 3791 | 118.46875 |
16 | translate.googleusercontent.com | 3553 | 111.03125 |
17 | pubchem.ncbi.nlm.nih.gov | 3469 | 108.40625 |
18 | www.cancer.gov | 3225 | 100.78125 |
19 | www.drugbank.ca | 2648 | 82.75000 |
# Total event count for top 20 hostnames by event type
# limited to W pages with external links
# further limited to extClick events
top_hosts_w_query = """
SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events
FROM citationusage
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND page_id IN
(SELECT DISTINCT page_id
FROM ryanmax.population_w_pages_with_extlinks
)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_w_events = spark.sql(
top_hosts_w_query.format(
parse_host.format('link_url'),
days_in_study,
event_exclusion_sql, start_date_string, end_date_string
))
top_hosts_w_events.toPandas()
host | total_events | daily_average_events | |
---|---|---|---|
0 | www.imdb.com | 3050694 | 95334.18750 |
1 | tools.wmflabs.org | 1064153 | 33254.78125 |
2 | www.youtube.com | 601433 | 18794.78125 |
3 | books.google.com | 573062 | 17908.18750 |
4 | www.espncricinfo.com | 379919 | 11872.46875 |
5 | doi.org | 350714 | 10959.81250 |
6 | www.nytimes.com | 301568 | 9424.00000 |
7 | www.bbc.co.uk | 227540 | 7110.62500 |
8 | www.theguardian.com | 225815 | 7056.71875 |
9 | www.instagram.com | 182944 | 5717.00000 |
10 | twitter.com | 180994 | 5656.06250 |
11 | www.google.com | 166936 | 5216.75000 |
12 | archive.org | 162153 | 5067.28125 |
13 | www.ncbi.nlm.nih.gov | 157171 | 4911.59375 |
14 | www.allmusic.com | 155615 | 4862.96875 |
15 | www.rottentomatoes.com | 140717 | 4397.40625 |
16 | www.findagrave.com | 130021 | 4063.15625 |
17 | news.bbc.co.uk | 129189 | 4037.15625 |
18 | www.telegraph.co.uk | 115366 | 3605.18750 |
19 | www.facebook.com | 111813 | 3494.15625 |
# Total event count for top 20 hostnames
# limited to WP:M pages with external links
# further limited to fnHover events
top_hosts_wpm_query = """
SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events
FROM citationusage
WHERE wiki = 'enwiki'
AND action = 'fnHover'
AND page_id IN
(SELECT DISTINCT page_id
FROM ryanmax.population_wpm_pages_with_extlinks
)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_wpm_events = spark.sql(
top_hosts_wpm_query.format(
parse_host.format('link_url'),
days_in_study,
event_exclusion_sql, start_date_string, end_date_string
))
top_hosts_wpm_events.toPandas()
host | total_events | daily_average_events | |
---|---|---|---|
0 | en.wikipedia.org | 1465057 | 45783.03125 |
1 | en.m.wikipedia.org | 89821 | 2806.90625 |
2 | translate.googleusercontent.com | 4032 | 126.00000 |
3 | None | 698 | 21.81250 |
4 | www.translatoruser-int.com | 149 | 4.65625 |
5 | papago.naver.net | 99 | 3.09375 |
6 | www.biblegateway.com | 39 | 1.21875 |
7 | z5h64q92x9.net | 21 | 0.65625 |
8 | scholar.google.com | 17 | 0.53125 |
9 | emedien3.sub.uni-hamburg.de | 12 | 0.37500 |
10 | en.wikipedi0.org | 5 | 0.15625 |
11 | translate.baiducontent.com | 4 | 0.12500 |
12 | en.m.wikipedi0.org | 3 | 0.09375 |
13 | www.gpedia.com | 2 | 0.06250 |
14 | en.wikipedia.org. | 2 | 0.06250 |
15 | books.google.com | 1 | 0.03125 |
16 | speechpanel.readspeaker.com | 1 | 0.03125 |
17 | nl.wikiludia.com | 1 | 0.03125 |
18 | en.wk.jsproxy.tk | 1 | 0.03125 |
# Total event count for top 20 hostnames by event type
# limited to W pages with external links
# further limited to fnHover events
top_hosts_w_query = """
SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events
FROM citationusage
WHERE wiki = 'enwiki'
AND action = 'fnHover'
AND page_id IN
(SELECT DISTINCT page_id
FROM ryanmax.population_w_pages_with_extlinks
)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_w_events = spark.sql(
top_hosts_w_query.format(
parse_host.format('link_url'),
days_in_study,
event_exclusion_sql, start_date_string, end_date_string
))
top_hosts_w_events.toPandas()
host | total_events | daily_average_events | |
---|---|---|---|
0 | en.wikipedia.org | 33809467 | 1.056546e+06 |
1 | en.m.wikipedia.org | 2015284 | 6.297762e+04 |
2 | translate.googleusercontent.com | 53714 | 1.678562e+03 |
3 | None | 27224 | 8.507500e+02 |
4 | www.biblegateway.com | 16145 | 5.045312e+02 |
5 | www.translatoruser-int.com | 2063 | 6.446875e+01 |
6 | papago.naver.net | 1250 | 3.906250e+01 |
7 | z5h64q92x9.net | 327 | 1.021875e+01 |
8 | en.wikipedi0.org | 246 | 7.687500e+00 |
9 | emedien3.sub.uni-hamburg.de | 169 | 5.281250e+00 |
10 | books.google.com | 104 | 3.250000e+00 |
11 | translate.baiducontent.com | 102 | 3.187500e+00 |
12 | www.gutenberg.org | 80 | 2.500000e+00 |
13 | fsfe.org | 32 | 1.000000e+00 |
14 | en.wikipedia.org. | 29 | 9.062500e-01 |
15 | wn.rsarchive.org | 28 | 8.750000e-01 |
16 | en.wk.jsproxy.tk | 28 | 8.750000e-01 |
17 | en.wikiludia.com | 26 | 8.125000e-01 |
18 | en.wikipedia.nom.rs | 13 | 4.062500e-01 |
19 | www.gpedia.com | 12 | 3.750000e-01 |
# Total event count for top 20 hostnames
# limited to WP:M pages with external links
# further limited to extClick, fnClick OR upClick events
top_hosts_wpm_query = """
SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events
FROM citationusage
WHERE wiki = 'enwiki'
AND action in ('extClick', 'upClick','fnClick')
AND page_id IN
(SELECT DISTINCT page_id
FROM ryanmax.population_wpm_pages_with_extlinks
)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_wpm_events = spark.sql(
top_hosts_wpm_query.format(
parse_host.format('link_url'),
days_in_study,
event_exclusion_sql, start_date_string, end_date_string
))
top_hosts_wpm_events.toPandas()
host | total_events | daily_average_events | |
---|---|---|---|
0 | en.m.wikipedia.org | 537488 | 16796.50000 |
1 | en.wikipedia.org | 395629 | 12363.40625 |
2 | www.ncbi.nlm.nih.gov | 86432 | 2701.00000 |
3 | doi.org | 74930 | 2341.56250 |
4 | books.google.com | 19728 | 616.50000 |
5 | www.drugs.com | 16639 | 519.96875 |
6 | www.who.int | 15105 | 472.03125 |
7 | theranos.com | 10660 | 333.12500 |
8 | www.cdc.gov | 10119 | 316.21875 |
9 | chemapps.stolaf.edu | 7613 | 237.90625 |
10 | translate.googleusercontent.com | 6743 | 210.71875 |
11 | apps.who.int | 6384 | 199.50000 |
12 | www.nlm.nih.gov | 6291 | 196.59375 |
13 | www.fda.gov | 5101 | 159.40625 |
14 | www.nytimes.com | 5005 | 156.40625 |
15 | www.commonchemistry.org | 4604 | 143.87500 |
16 | www.accessdata.fda.gov | 4352 | 136.00000 |
17 | www.youtube.com | 4154 | 129.81250 |
18 | emedicine.medscape.com | 3791 | 118.46875 |
19 | pubchem.ncbi.nlm.nih.gov | 3469 | 108.40625 |
# Total event count for top 20 hostnames by event type
# limited to W pages with external links
# further limited to extClick, fnClick OR upClick events
top_hosts_w_query = """
SELECT {} AS host, COUNT(*) AS total_events, COUNT(*)/{} AS daily_average_events
FROM citationusage
WHERE wiki = 'enwiki'
AND action in ('extClick', 'upClick','fnClick')
AND page_id IN
(SELECT DISTINCT page_id
FROM ryanmax.population_w_pages_with_extlinks
)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_w_events = spark.sql(
top_hosts_w_query.format(
parse_host.format('link_url'),
days_in_study,
event_exclusion_sql, start_date_string, end_date_string
))
top_hosts_w_events.toPandas()
host | total_events | daily_average_events | |
---|---|---|---|
0 | en.m.wikipedia.org | 15621530 | 488172.81250 |
1 | en.wikipedia.org | 8524542 | 266391.93750 |
2 | www.imdb.com | 3050694 | 95334.18750 |
3 | tools.wmflabs.org | 1064153 | 33254.78125 |
4 | www.youtube.com | 601433 | 18794.78125 |
5 | books.google.com | 573142 | 17910.68750 |
6 | www.espncricinfo.com | 379919 | 11872.46875 |
7 | doi.org | 350714 | 10959.81250 |
8 | www.nytimes.com | 301568 | 9424.00000 |
9 | www.bbc.co.uk | 227540 | 7110.62500 |
10 | www.theguardian.com | 225815 | 7056.71875 |
11 | www.instagram.com | 182944 | 5717.00000 |
12 | twitter.com | 180994 | 5656.06250 |
13 | www.google.com | 166936 | 5216.75000 |
14 | archive.org | 162164 | 5067.62500 |
15 | www.ncbi.nlm.nih.gov | 157171 | 4911.59375 |
16 | www.allmusic.com | 155615 | 4862.96875 |
17 | www.rottentomatoes.com | 140717 | 4397.40625 |
18 | translate.googleusercontent.com | 131144 | 4098.25000 |
19 | www.findagrave.com | 130021 | 4063.15625 |
# wikipedia.org links by event type
top_wiki_hosts_query = """
SELECT {} AS host, action, COUNT(*) AS count
FROM citationusage
WHERE wiki = 'enwiki'
AND lower(parse_url(link_url,'HOST')) like '%wikipedia.org'
AND (
page_id IN
(SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
OR page_id IN
(SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY host, action
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_wiki_hosts_events = spark.sql(
top_wiki_hosts_query.format(
parse_host.format('link_url'),
event_exclusion_sql, start_date_string, end_date_string
))
top_wiki_hosts_events_rdd = top_wiki_hosts_events.rdd
top_wiki_hosts_events_df = sqlContext.createDataFrame(top_wiki_hosts_events_rdd)
top_wiki_hosts_events_pandas = top_wiki_hosts_events_df.toPandas()
top_wiki_hosts_events_pandas.pivot(index='host', columns='action', values='count')
action | extClick | fnClick | fnHover | upClick |
---|---|---|---|---|
host | ||||
ar.wikipedia.org | 276.0 | NaN | NaN | NaN |
de.wikipedia.org | 1798.0 | NaN | NaN | NaN |
en.m.wikipedia.org | NaN | 16044275.0 | 2105104.0 | 114695.0 |
en.wikipedia.org | 262.0 | 7875877.0 | 35274475.0 | 1043977.0 |
eo.wikipedia.org | 280.0 | NaN | NaN | NaN |
es.wikipedia.org | 785.0 | NaN | NaN | NaN |
fr.wikipedia.org | 1804.0 | NaN | NaN | NaN |
it.wikipedia.org | 646.0 | NaN | NaN | NaN |
ja.wikipedia.org | 1973.0 | NaN | NaN | NaN |
nl.wikipedia.org | 1062.0 | NaN | NaN | NaN |
pt.wikipedia.org | 383.0 | NaN | NaN | NaN |
ru.wikipedia.org | 1092.0 | NaN | NaN | NaN |
simple.wikipedia.org | 563.0 | NaN | NaN | NaN |
www.wikipedia.org | 941.0 | NaN | NaN | NaN |
zh.wikipedia.org | 1171.0 | NaN | NaN | NaN |
# web.archive.org links by event type for W:PM pages withn external links
archive_org_wpm_query = """
SELECT PARSE_URL(LOWER(link_url),'HOST') AS host, action, COUNT(*) AS count
FROM citationusage
WHERE wiki = 'enwiki'
AND LOWER(link_url) RLIKE '^https?://web.archive.org/web/'
AND PARSE_URL(LOWER(link_url),'HOST') = 'web.archive.org'
AND page_id IN
(SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY host, action
"""
archive_org_wpm_events = spark.sql(
archive_org_wpm_query.format(
event_exclusion_sql, start_date_string, end_date_string
))
archive_org_wpm_events_rdd = archive_org_wpm_events.rdd
archive_org_wpm_events_df = sqlContext.createDataFrame(archive_org_wpm_events_rdd)
archive_org_wpm_events_pandas = archive_org_wpm_events_df.toPandas()
archive_org_wpm_events_pandas.pivot(index='host', columns='action', values='count')
action | extClick | fnClick |
---|---|---|
host | ||
web.archive.org | 51898 | 3 |
# web.archive.org links by event type for W pages withn external links
archive_org_w_query = """
SELECT PARSE_URL(LOWER(link_url),'HOST') AS host, action, COUNT(*) AS count
FROM citationusage
WHERE wiki = 'enwiki'
AND LOWER(link_url) RLIKE '^https?://web.archive.org/web/'
AND PARSE_URL(LOWER(link_url),'HOST') = 'web.archive.org'
AND page_id IN
(SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY host, action
"""
archive_org_w_events = spark.sql(
archive_org_w_query.format(
event_exclusion_sql, start_date_string, end_date_string
))
archive_org_w_events_rdd = archive_org_w_events.rdd
archive_org_w_events_df = sqlContext.createDataFrame(archive_org_w_events_rdd)
archive_org_w_events_pandas = archive_org_w_events_df.toPandas()
archive_org_w_events_pandas.pivot(index='host', columns='action', values='count')
action | extClick | fnClick | fnHover | upClick |
---|---|---|---|---|
host | ||||
web.archive.org | 1963023 | 31 | 57 | 1 |