➤ *Please view this notebook on nbviewer*
For various baseline measurements for the evaluation of Automoderator, we want to develop criteria to identify potential vandalism. In this analysis the criteria are compared with the revert risk scores. Starting with an initial set of criteria, different dimensions are used to see how each one affects the median revert risk score by project, and also how restricting the criteria further eliminates edits from consideration. The goal is to find a balance: a good median score without eliminating too many edits from consideration.
Initial criteria:
Dimensions considered
Based on the analysis, the following additions/modifications can improve the median risk score
# Summary cell: a centered, bold heading followed by the per-criteria tables side by side.
# NOTE(review): `results` is only assigned in a later cell of this export — presumably this
# cell is rendered after the whole notebook has run; confirm the execution order.
pr_centered('Changes in the Median Risk & Number of Edits', True)
display_h(results)
wiki_db | median_risk | n_edits | |
---|---|---|---|
0 | dewiki | 0.901974 | 16829 |
1 | enwiki | 0.910679 | 172584 |
2 | eswiki | 0.922596 | 55105 |
3 | fawiki | 0.916366 | 9967 |
4 | frwiki | 0.903316 | 19375 |
5 | idwiki | 0.902464 | 3554 |
6 | itwiki | 0.919648 | 23440 |
7 | jawiki | 0.875682 | 10170 |
8 | ptwiki | 0.913064 | 3361 |
9 | ruwiki | 0.914291 | 23587 |
10 | zhwiki | 0.883454 | 7568 |
wiki_db | median_risk | n_edits | |
---|---|---|---|
0 | dewiki | 0.904239 | 16077 |
1 | enwiki | 0.912205 | 162439 |
2 | eswiki | 0.923474 | 52922 |
3 | fawiki | 0.916792 | 9228 |
4 | frwiki | 0.905588 | 18401 |
5 | idwiki | 0.901994 | 3231 |
6 | itwiki | 0.921301 | 22077 |
7 | jawiki | 0.879789 | 9401 |
8 | ptwiki | 0.914363 | 3147 |
9 | ruwiki | 0.916403 | 22250 |
10 | zhwiki | 0.886989 | 6880 |
wiki_db | median_risk | n_edits | |
---|---|---|---|
0 | dewiki | 0.904503 | 16061 |
1 | enwiki | 0.912847 | 160889 |
2 | eswiki | 0.923850 | 52696 |
3 | fawiki | 0.918056 | 9136 |
4 | frwiki | 0.906304 | 18285 |
5 | idwiki | 0.902892 | 3190 |
6 | itwiki | 0.921365 | 22011 |
7 | jawiki | 0.880116 | 9109 |
8 | ptwiki | 0.916916 | 3079 |
9 | ruwiki | 0.916746 | 22204 |
10 | zhwiki | 0.887588 | 6819 |
wiki_db | median_risk | n_edits | |
---|---|---|---|
0 | dewiki | 0.907555 | 15468 |
1 | enwiki | 0.915196 | 153858 |
2 | eswiki | 0.924792 | 51696 |
3 | fawiki | 0.920468 | 8539 |
4 | frwiki | 0.909034 | 17489 |
5 | idwiki | 0.905071 | 3067 |
6 | itwiki | 0.922709 | 21633 |
7 | jawiki | 0.882525 | 8828 |
8 | ptwiki | 0.930669 | 2458 |
9 | ruwiki | 0.918103 | 21661 |
10 | zhwiki | 0.890380 | 6481 |
wiki_db | median_risk | n_edits | |
---|---|---|---|
0 | dewiki | 0.917214 | 11281 |
1 | enwiki | 0.920194 | 115997 |
2 | eswiki | 0.930483 | 39239 |
3 | fawiki | 0.924352 | 6734 |
4 | frwiki | 0.913709 | 13492 |
5 | idwiki | 0.910019 | 2361 |
6 | itwiki | 0.924533 | 15505 |
7 | jawiki | 0.883670 | 6679 |
8 | ptwiki | 0.934228 | 1855 |
9 | ruwiki | 0.923788 | 16914 |
10 | zhwiki | 0.896337 | 4813 |
import pandas as pd
import numpy as np
import wmfdata as wmf
pd.options.display.max_columns = None
from IPython.display import clear_output
import warnings
import random
from datetime import datetime
from IPython.display import display_html
from IPython.display import display, HTML
from IPython.display import clear_output
# import seaborn as sns
# import matplotlib.pyplot as plt
# Stop a Spark session that is already active before a new one is created;
# otherwise just report that there was nothing to stop.
spark_session = wmf.spark.get_active_session()
if spark_session is not None:
    spark_session.stop()
else:
    print('no active session')
# Create the Spark session used by the rest of the notebook, running on YARN
# with explicit driver/executor sizing.
spark_session = wmf.spark.create_custom_session(
master="yarn",
app_name='vandal-criteria-comparision',
spark_config={
"spark.driver.memory": "6g",
"spark.dynamicAllocation.maxExecutors": 64,
"spark.executor.memory": "24g",
"spark.executor.cores": 4,
"spark.sql.shuffle.partitions": 256,
"spark.driver.maxResultSize": "2g"
}
)
# Clear whatever the session start-up printed into the cell output.
clear_output()
# Only log errors from Spark from here on.
spark_session.sparkContext.setLogLevel("ERROR")
# Last expression of the cell: displays the session object.
spark_session
SparkSession - hive
# Read the revert-risk scores from parquet and register them as the temporary
# view `rr_scores`, so the SQL query in this notebook can join against them.
rr_scores_path = '/user/paragon/riskobservatory/revertrisk_20212022_anonymous_bot.parquet'
rr_scores = spark_session.read.parquet(rr_scores_path)
rr_scores.createOrReplaceTempView('rr_scores')
# Print the column names and types of the loaded scores.
rr_scores.printSchema()
[Stage 0:> (0 + 1) / 1]
root |-- rev_id: long (nullable = true) |-- wiki_db: string (nullable = true) |-- rev_timestamp: string (nullable = true) |-- revision_is_identity_reverted: boolean (nullable = true) |-- revision_seconds_to_identity_revert: long (nullable = true) |-- page_id: long (nullable = true) |-- revision_revert_risk: float (nullable = true) |-- user_is_anonymous: boolean (nullable = true) |-- user_is_bot: boolean (nullable = true)
# Snapshot of wmf.mediawiki_history that the query filters on.
mwh_snapshot = '2023-10'
# Wiki database names covered by the analysis ('enwiki', 'eswiki', ...).
wikis_list = [f'{lang}wiki' for lang in ['en', 'es', 'ja', 'de', 'fr', 'ru', 'zh', 'it', 'pt', 'fa', 'id']]
# Interpolated after `IN` in the query; presumably a SQL tuple literal of the names —
# confirm against wmfdata's `sql_tuple`.
wikis_sql = wmf.utils.sql_tuple(wikis_list)
# generate `num_dates` random dates within a given year
def generate_random_dates(year, num_dates):
    """Return ``num_dates`` distinct random dates of ``year`` as 'YYYY-MM-DD' strings.

    Every calendar day of the year is equally likely, leap days included, and
    no date is returned twice, so the result always holds exactly
    ``num_dates`` different days.

    Args:
        year: Calendar year to draw the dates from.
        num_dates: Number of distinct dates to draw.

    Returns:
        List of date strings in the order they were drawn.

    Raises:
        ValueError: If ``num_dates`` is negative or larger than the number of
            days in ``year``.
    """
    # Local import: only `datetime.datetime` is imported at file level.
    from datetime import date, timedelta

    first_day = date(year, 1, 1)
    days_in_year = (date(year + 1, 1, 1) - first_day).days
    # Sampling day offsets without replacement avoids duplicates and the
    # bias of picking the month first and the day second.
    offsets = random.sample(range(days_in_year), num_dates)
    return [
        (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in offsets
    ]
# Draw the 2022 dates whose edits are analysed; the second value is interpolated
# after `IN` in the query's date filter.
# NOTE(review): no random seed is set in the visible code, so the sample presumably
# differs from run to run — confirm whether reproducibility is needed.
random_dates_2022 = generate_random_dates(2022, 30)
random_dates_2022_sql = wmf.utils.sql_tuple(random_dates_2022)
%%time
# The query has two steps:
# 1. `base_criteria` joins the scored revisions (rr_scores) with mediawiki_history on
#    wiki and revision id, and keeps revision-create events on the sampled dates, in
#    content namespaces, made by anonymous users or users with at most 250 revisions,
#    with an empty `event_user_is_bot_by_historical`, that were identity-reverted
#    within 3 days.
# 2. The outer SELECT joins every such edit to its first reverting revision to pick up
#    the reverting user's attributes, and drops rows where the reverting user's name
#    equals the name of the user who made the edit.
query = f"""
WITH
base_criteria AS (
SELECT
mwh.wiki_db,
rr.rev_id,
revision_revert_risk AS risk,
mwh.event_user_text AS user_name,
event_timestamp AS rev_ts,
event_user_is_anonymous AS is_anon,
event_user_revision_count AS user_edit_count,
COALESCE(event_user_registration_timestamp, event_user_creation_timestamp) AS user_reg_ts,
event_user_first_edit_timestamp AS user_first_rev_ts,
event_user_seconds_since_previous_revision AS time_user_prev_rev,
page_seconds_since_previous_revision AS time_page_prev_rev,
revision_text_bytes_diff AS rev_bytes_diff,
mwh.revision_seconds_to_identity_revert AS time_to_revert,
revision_text_bytes AS rev_bytes,
revision_is_identity_revert AS reverting_edit,
revision_first_identity_reverting_revision_id AS reverting_edit_id
FROM
rr_scores rr
JOIN
wmf.mediawiki_history mwh
ON rr.wiki_db = mwh.wiki_db AND rr.rev_id = mwh.revision_id
WHERE
snapshot = '{mwh_snapshot}'
AND rr.wiki_db IN {wikis_sql}
AND event_entity = 'revision'
AND event_type = 'create'
AND DATE(event_timestamp) IN {random_dates_2022_sql}
AND page_namespace_is_content
AND (event_user_is_anonymous OR event_user_revision_count <= 250)
AND SIZE(event_user_is_bot_by_historical) = 0
AND mwh.revision_is_identity_reverted
AND mwh.revision_seconds_to_identity_revert <= 3*24*60*60
)
SELECT
bc.*,
mwh.event_user_is_anonymous AS reverting_user_is_anon,
mwh.event_user_revision_count AS reverting_user_edit_count,
mwh.event_user_first_edit_timestamp AS reverting_user_first_rev_ts,
mwh.revision_is_identity_reverted AS is_revert_reverted,
mwh.revision_seconds_to_identity_revert AS revert_time_to_revert
FROM
base_criteria bc
JOIN
wmf.mediawiki_history mwh
ON bc.wiki_db = mwh.wiki_db AND bc.reverting_edit_id = mwh.revision_id
WHERE
snapshot = '{mwh_snapshot}'
AND NOT bc.user_name = mwh.event_user_text
"""
# Run the query; the result is handled as a pandas DataFrame in the following cells.
edits = wmf.spark.run(query)
2]
CPU times: user 5.2 s, sys: 0 ns, total: 5.2 s Wall time: 5min 16s
# Parse the timestamp columns as UTC datetimes and store the boolean flag
# columns as categoricals, then print the resulting column summary.
edits = edits.assign(
    **{
        ts_column: pd.to_datetime(edits[ts_column], utc=True)
        for ts_column in (
            'rev_ts',
            'user_reg_ts',
            'user_first_rev_ts',
            'reverting_user_first_rev_ts',
        )
    },
    **{
        flag_column: pd.Categorical(edits[flag_column])
        for flag_column in (
            'is_anon',
            'reverting_user_is_anon',
            'is_revert_reverted',
        )
    },
)
edits.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 391096 entries, 0 to 391095 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 wiki_db 391096 non-null object 1 rev_id 391096 non-null int64 2 risk 391096 non-null float32 3 user_name 391096 non-null object 4 rev_ts 391096 non-null datetime64[ns, UTC] 5 is_anon 391096 non-null category 6 user_edit_count 92095 non-null float64 7 user_reg_ts 92053 non-null datetime64[ns, UTC] 8 user_first_rev_ts 92095 non-null datetime64[ns, UTC] 9 time_user_prev_rev 75259 non-null float64 10 time_page_prev_rev 391096 non-null int64 11 rev_bytes_diff 387254 non-null float64 12 time_to_revert 391096 non-null int64 13 rev_bytes 387436 non-null float64 14 reverting_edit 391096 non-null bool 15 reverting_edit_id 391096 non-null int64 16 reverting_user_is_anon 391096 non-null category 17 reverting_user_edit_count 376252 non-null float64 18 reverting_user_first_rev_ts 376252 non-null datetime64[ns, UTC] 19 is_revert_reverted 391096 non-null category 20 revert_time_to_revert 48583 non-null float64 dtypes: bool(1), category(3), datetime64[ns, UTC](4), float32(1), float64(6), int64(4), object(2) memory usage: 50.7+ MB
# prints a string at center of the output, bold if needed
def pr_centered(content, bold=False):
    """Show ``content`` as a centered line of HTML in the cell output.

    Args:
        content: Text (or HTML fragment) to show.
        bold: Wrap the content in ``<b>`` tags when true.
    """
    inner = f"<b>{content}</b>" if bold else content
    display(HTML(f"<div style='text-align:center'>{inner}</div>"))
# display dataframes horizontally with title for each
def display_h(frames, space=100):
    """Render several tables next to each other, each preceded by its title.

    Args:
        frames: Mapping of title to an object exposing ``_repr_html_()``
            (the notebook passes DataFrames and Stylers).
        space: Accepted for compatibility; not used by the layout.
    """
    cells = "".join(
        f'<div>{title} {frames[title]._repr_html_()}</div>'
        for title in frames.keys()
    )
    # A flex container lays the per-table <div> elements out in one row.
    markup = f"""
<div style="display:flex; justify-content: space-evenly;">
{cells}
</div>"""
    display_html(markup, raw=True)
def calculate_grouped(df, intervals, pivot_column, columns_title=None, column_names=None, target_column='risk', group_column='wiki_db', grp_function='median'):
    """Aggregate ``target_column`` per group for a series of thresholds.

    For each value in ``intervals`` the rows of ``df`` are filtered on
    ``pivot_column``, grouped by ``group_column`` and aggregated with
    ``grp_function``. Rows are kept when ``pivot_column <= interval``; for
    ``'rev_bytes_diff'`` the absolute value must instead be ``>= interval``.

    Args:
        df: Input frame; it is not modified.
        intervals: Threshold values, one output column each.
        pivot_column: Column the thresholds are applied to.
        columns_title: Optional text used in the name of the column index
            instead of ``pivot_column``.
        column_names: Optional labels replacing the interval values as column
            headers, given in the same order as ``intervals``.
        target_column: Column that is aggregated.
        group_column: Column to group by; becomes the index of the result.
        grp_function: Aggregation passed to ``DataFrameGroupBy.agg``.

    Returns:
        DataFrame indexed by ``group_column`` with one column per interval, in
        the order of ``intervals``. Groups (or whole intervals) without
        matching rows hold NaN.
    """
    keep_larger = pivot_column == 'rev_bytes_diff'
    if keep_larger:
        # unlike other temporal columns, bytes difference should be greater than given value.
        # `assign` builds a new frame, so the caller's column keeps its signed values.
        df = df.assign(**{pivot_column: df[pivot_column].abs()})

    final_results = []
    for interval in intervals:
        if keep_larger:
            filtered_df = df[df[pivot_column] >= interval]
        else:
            filtered_df = df[df[pivot_column] <= interval]
        grouped = filtered_df.groupby(group_column).agg({target_column: grp_function}).reset_index()
        grouped['interval'] = interval
        final_results.append(grouped)

    concatenated_df = pd.concat(final_results)
    pivot_df = concatenated_df.pivot(index=group_column, columns='interval', values=target_column)
    # Keep one column per requested interval, in the requested order: an interval
    # without matching rows would otherwise be missing and make `column_names`
    # fail or label the wrong column.
    pivot_df = pivot_df.reindex(columns=list(intervals))

    title = pivot_column if columns_title is None else columns_title
    pivot_df.columns.name = f'median: {title}'
    if column_names is not None:
        pivot_df.columns = column_names
    return pivot_df
# def plot_hmap(df, x_label, title, fontsize=10, y_label='Wikipedia', cbar_label='Median Risk'):
# ax = sns.heatmap(df, annot=True, annot_kws={"size": fontsize})
# # set labels
# ax.set_xlabel(x_label, fontsize=fontsize)
# ax.set_ylabel(y_label, fontsize=fontsize)
# ax.set_title(title, fontsize=fontsize + 1)
# # color bar properties
# cbar = ax.collections[0].colorbar
# cbar.set_label(cbar_label, fontsize=fontsize)
# cbar.ax.tick_params(labelsize=fontsize)
# plt.show()
def time_delta(df, start_column, end_column):
    """Return the seconds elapsed from ``start_column`` to ``end_column`` per row.

    Args:
        df: Frame holding both columns as datetime-like values.
        start_column: Name of the column with the earlier timestamp.
        end_column: Name of the column with the later timestamp.

    Returns:
        Float Series aligned with ``df`` (NaN where either timestamp is
        missing). If the difference cannot be computed — a column is absent or
        the columns are not datetime-like — a scalar NaN is returned instead,
        which callers such as ``DataFrame.assign`` broadcast to every row.
    """
    try:
        # Vectorised subtraction; equivalent to a row-by-row total_seconds()
        # but much faster, and it yields an empty Series for an empty frame.
        return (df[end_column] - df[start_column]).dt.total_seconds()
    except (KeyError, TypeError, AttributeError):
        # Best-effort fallback; `np.nan` is used because the `np.NaN` alias
        # no longer exists in NumPy 2.
        return np.nan
# Initial criteria: reverted within 24 hours, and made either anonymously or
# by a user with at most 25 edits.
init_criteria = edits.query(
    "(time_to_revert <= 24*60*60) & ((is_anon == True) | (user_edit_count <= 25))"
)

# Seconds between each reference timestamp and the time of the edit itself.
init_criteria = init_criteria.assign(**{
    elapsed_column: time_delta(init_criteria, start_column, 'rev_ts')
    for elapsed_column, start_column in (
        ('elapsed_reg', 'user_reg_ts'),
        ('elapsed_first_rev', 'user_first_rev_ts'),
        ('rv_user_elapsed_first_rev', 'reverting_user_first_rev_ts'),
    )
})
# Baseline per wiki: median revert-risk score and number of edits under the
# initial criteria.
init_criteria_risk = (
    init_criteria
    .groupby('wiki_db', as_index=False)
    .agg({'risk': 'median', 'rev_id': 'count'})
    .rename(columns={'risk': 'median_risk', 'rev_id': 'n_edits'})
)
init_criteria_risk
wiki_db | median_risk | n_edits | |
---|---|---|---|
0 | dewiki | 0.901974 | 16829 |
1 | enwiki | 0.910679 | 172584 |
2 | eswiki | 0.922596 | 55105 |
3 | fawiki | 0.916366 | 9967 |
4 | frwiki | 0.903316 | 19375 |
5 | idwiki | 0.902464 | 3554 |
6 | itwiki | 0.919648 | 23440 |
7 | jawiki | 0.875682 | 10170 |
8 | ptwiki | 0.913064 | 3361 |
9 | ruwiki | 0.914291 | 23587 |
10 | zhwiki | 0.883454 | 7568 |
# Upper bounds on the time to revert, given in hours and converted to seconds.
ttr_hour_intervals = [1, 2, 4, 8, 12, 24]
ttr_time_intervals = [hours * 60 * 60 for hours in ttr_hour_intervals]
ttr_column_names = [f'{hours} hr' for hours in ttr_hour_intervals]

# The same thresholds aggregated twice: median risk first, row counts second.
ttr_median_risk, ttr_interval_counts = (
    calculate_grouped(
        init_criteria,
        ttr_time_intervals,
        'time_to_revert',
        column_names=ttr_column_names,
        grp_function=aggregation,
    )
    for aggregation in ('median', 'count')
)

display_h({
    'Median Risk': ttr_median_risk.style.background_gradient(cmap='viridis_r').format("{:.3f}"),
    'Number of Edits': ttr_interval_counts,
})
1 hr | 2 hr | 4 hr | 8 hr | 12 hr | 24 hr | |
---|---|---|---|---|---|---|
wiki_db | ||||||
dewiki | 0.910 | 0.908 | 0.906 | 0.905 | 0.904 | 0.902 |
enwiki | 0.920 | 0.918 | 0.915 | 0.913 | 0.912 | 0.911 |
eswiki | 0.928 | 0.926 | 0.925 | 0.924 | 0.923 | 0.923 |
fawiki | 0.918 | 0.918 | 0.918 | 0.917 | 0.917 | 0.916 |
frwiki | 0.913 | 0.911 | 0.909 | 0.907 | 0.906 | 0.903 |
idwiki | 0.900 | 0.901 | 0.899 | 0.902 | 0.902 | 0.902 |
itwiki | 0.927 | 0.926 | 0.924 | 0.922 | 0.921 | 0.920 |
jawiki | 0.894 | 0.891 | 0.886 | 0.882 | 0.880 | 0.876 |
ptwiki | 0.920 | 0.918 | 0.917 | 0.916 | 0.914 | 0.913 |
ruwiki | 0.926 | 0.923 | 0.920 | 0.918 | 0.916 | 0.914 |
zhwiki | 0.896 | 0.893 | 0.891 | 0.889 | 0.887 | 0.883 |
1 hr | 2 hr | 4 hr | 8 hr | 12 hr | 24 hr | |
---|---|---|---|---|---|---|
wiki_db | ||||||
dewiki | 13349 | 14151 | 14974 | 15661 | 16077 | 16829 |
enwiki | 114940 | 128218 | 141591 | 155008 | 162439 | 172584 |
eswiki | 42487 | 45577 | 48656 | 51468 | 52922 | 55105 |
fawiki | 6798 | 7417 | 8123 | 8816 | 9228 | 9967 |
frwiki | 14078 | 15335 | 16506 | 17687 | 18401 | 19375 |
idwiki | 1662 | 2070 | 2550 | 3006 | 3231 | 3554 |
itwiki | 16739 | 18198 | 19752 | 21189 | 22077 | 23440 |
jawiki | 6351 | 7245 | 8150 | 8943 | 9401 | 10170 |
ptwiki | 2081 | 2347 | 2686 | 2985 | 3147 | 3361 |
ruwiki | 15851 | 17794 | 19570 | 21248 | 22250 | 23587 |
zhwiki | 4071 | 4823 | 5637 | 6446 | 6880 | 7568 |
Limiting to 8 hr window provides a slight improvement without eliminating a lot of edits.
# Upper bounds on the editing user's edit count.
edit_count_intervals = [5, 10, 15, 20, 25]
edit_count_column_names = [f'{limit} edits' for limit in edit_count_intervals]

# The same thresholds aggregated twice: median risk first, row counts second.
edit_count_median_risk, edit_count_interval_counts = (
    calculate_grouped(
        init_criteria,
        edit_count_intervals,
        'user_edit_count',
        column_names=edit_count_column_names,
        grp_function=aggregation,
    )
    for aggregation in ('median', 'count')
)

display_h({
    'Median Risk': edit_count_median_risk.style.background_gradient(cmap='viridis_r').format("{:.3f}"),
    'Number of Edits': edit_count_interval_counts,
})
5 edits | 10 edits | 15 edits | 20 edits | 25 edits | |
---|---|---|---|---|---|
wiki_db | |||||
dewiki | 0.900 | 0.892 | 0.885 | 0.880 | 0.876 |
enwiki | 0.924 | 0.918 | 0.913 | 0.910 | 0.908 |
eswiki | 0.936 | 0.931 | 0.929 | 0.926 | 0.924 |
fawiki | 0.929 | 0.923 | 0.920 | 0.915 | 0.911 |
frwiki | 0.919 | 0.910 | 0.903 | 0.897 | 0.893 |
idwiki | 0.906 | 0.893 | 0.891 | 0.889 | 0.886 |
itwiki | 0.921 | 0.914 | 0.908 | 0.905 | 0.904 |
jawiki | 0.920 | 0.916 | 0.909 | 0.904 | 0.899 |
ptwiki | 0.923 | 0.919 | 0.917 | 0.914 | 0.913 |
ruwiki | 0.921 | 0.914 | 0.910 | 0.906 | 0.903 |
zhwiki | 0.909 | 0.903 | 0.900 | 0.897 | 0.892 |
5 edits | 10 edits | 15 edits | 20 edits | 25 edits | |
---|---|---|---|---|---|
wiki_db | |||||
dewiki | 1560 | 1893 | 2086 | 2200 | 2301 |
enwiki | 22503 | 28206 | 31350 | 33433 | 34986 |
eswiki | 3898 | 4866 | 5345 | 5655 | 5851 |
fawiki | 1227 | 1702 | 2018 | 2259 | 2442 |
frwiki | 2398 | 2944 | 3252 | 3463 | 3611 |
idwiki | 268 | 383 | 443 | 495 | 527 |
itwiki | 1230 | 1514 | 1647 | 1745 | 1820 |
jawiki | 1342 | 1889 | 2239 | 2484 | 2691 |
ptwiki | 2345 | 2848 | 3079 | 3236 | 3361 |
ruwiki | 1971 | 2373 | 2594 | 2737 | 2833 |
zhwiki | 863 | 1201 | 1393 | 1510 | 1587 |
Limiting to 15 edits slightly improves the scores without eliminating a lot of edits.
# The elapsed-time dimensions are analysed for registered users only.
non_anon = init_criteria.query("""is_anon == False""").reset_index(drop=True)

elapsed_reg_minutes = [1, 5, 30]
elapsed_reg_hours = [1, 2, 4, 12, 24, 48, 72]
# Thresholds in seconds. The last one is the observed maximum, used unconverted
# so that the 'max' column is guaranteed to cover every row with a value.
elapsed_reg_time_intervals = (
    [minutes * 60 for minutes in elapsed_reg_minutes]
    + [hours * 60 * 60 for hours in elapsed_reg_hours]
    + [non_anon.elapsed_reg.max()]
)
elapsed_reg_column_names = (
    [f'{minutes} min' for minutes in elapsed_reg_minutes]
    + [f'{hours} hr' for hours in elapsed_reg_hours]
    + ['max']
)

elapsed_reg_median_risk = calculate_grouped(non_anon, elapsed_reg_time_intervals,
                                            'elapsed_reg', column_names=elapsed_reg_column_names)
elapsed_reg_interval_counts = calculate_grouped(non_anon, elapsed_reg_time_intervals,
                                                'elapsed_reg', column_names=elapsed_reg_column_names,
                                                grp_function='count')

display_h({
    'Median Risk': elapsed_reg_median_risk.style.background_gradient(cmap='viridis_r').format("{:.3f}"),
    'Number of Edits': elapsed_reg_interval_counts
})
1 min | 5 min | 30 min | 1 hr | 2 hr | 4 hr | 12 hr | 24 hr | 48 hr | 72 hr | max | |
---|---|---|---|---|---|---|---|---|---|---|---|
wiki_db | |||||||||||
dewiki | 0.953 | 0.936 | 0.930 | 0.930 | 0.929 | 0.929 | 0.928 | 0.927 | 0.926 | 0.924 | 0.876 |
enwiki | 0.937 | 0.941 | 0.938 | 0.936 | 0.935 | 0.934 | 0.934 | 0.933 | 0.932 | 0.931 | 0.908 |
eswiki | 0.962 | 0.949 | 0.946 | 0.945 | 0.944 | 0.944 | 0.943 | 0.943 | 0.940 | 0.939 | 0.924 |
fawiki | 0.929 | 0.943 | 0.944 | 0.944 | 0.943 | 0.943 | 0.942 | 0.940 | 0.938 | 0.936 | 0.911 |
frwiki | 0.945 | 0.940 | 0.933 | 0.931 | 0.930 | 0.930 | 0.930 | 0.930 | 0.928 | 0.927 | 0.893 |
idwiki | 0.932 | 0.934 | 0.918 | 0.915 | 0.909 | 0.909 | 0.909 | 0.911 | 0.906 | 0.905 | 0.886 |
itwiki | 0.968 | 0.947 | 0.939 | 0.940 | 0.940 | 0.940 | 0.938 | 0.938 | 0.935 | 0.935 | 0.904 |
jawiki | 0.950 | 0.935 | 0.928 | 0.926 | 0.924 | 0.923 | 0.921 | 0.922 | 0.921 | 0.919 | 0.899 |
ptwiki | 0.918 | 0.935 | 0.937 | 0.936 | 0.935 | 0.935 | 0.935 | 0.934 | 0.934 | 0.933 | 0.913 |
ruwiki | 0.942 | 0.939 | 0.934 | 0.934 | 0.934 | 0.934 | 0.932 | 0.931 | 0.929 | 0.928 | 0.903 |
zhwiki | 0.966 | 0.938 | 0.932 | 0.933 | 0.932 | 0.932 | 0.928 | 0.928 | 0.922 | 0.920 | 0.892 |
1 min | 5 min | 30 min | 1 hr | 2 hr | 4 hr | 12 hr | 24 hr | 48 hr | 72 hr | max | |
---|---|---|---|---|---|---|---|---|---|---|---|
wiki_db | |||||||||||
dewiki | 40 | 350 | 855 | 995 | 1050 | 1094 | 1149 | 1191 | 1247 | 1295 | 2299 |
enwiki | 794 | 6181 | 14757 | 16826 | 18287 | 19213 | 20335 | 21605 | 22581 | 23170 | 34976 |
eswiki | 159 | 1278 | 2855 | 3198 | 3459 | 3578 | 3733 | 3981 | 4212 | 4338 | 5851 |
fawiki | 8 | 247 | 772 | 936 | 1038 | 1103 | 1164 | 1258 | 1331 | 1370 | 2442 |
frwiki | 53 | 652 | 1557 | 1766 | 1906 | 1995 | 2083 | 2160 | 2260 | 2318 | 3611 |
idwiki | 8 | 56 | 153 | 194 | 230 | 242 | 246 | 270 | 284 | 296 | 527 |
itwiki | 69 | 434 | 833 | 925 | 1004 | 1048 | 1097 | 1152 | 1212 | 1227 | 1820 |
jawiki | 99 | 645 | 1582 | 1726 | 1817 | 1876 | 1950 | 2016 | 2075 | 2111 | 2691 |
ptwiki | 22 | 673 | 1697 | 1946 | 2104 | 2185 | 2261 | 2319 | 2370 | 2410 | 3361 |
ruwiki | 46 | 544 | 1353 | 1531 | 1627 | 1690 | 1751 | 1841 | 1907 | 1939 | 2833 |
zhwiki | 21 | 238 | 653 | 765 | 794 | 848 | 910 | 964 | 1044 | 1072 | 1587 |
Limiting to a 48 hr window significantly improves the scores. However, this applies only when registered users are considered.
elapsed_first_rev_minutes = [1, 5, 30]
elapsed_first_rev_hours = [1, 2, 4, 12, 24, 48, 72]
# Thresholds in seconds. The last one is the observed maximum, used unconverted
# so that the 'max' column is guaranteed to cover every row with a value.
elapsed_first_rev_time_intervals = (
    [minutes * 60 for minutes in elapsed_first_rev_minutes]
    + [hours * 60 * 60 for hours in elapsed_first_rev_hours]
    + [non_anon.elapsed_first_rev.max()]
)
elapsed_first_rev_column_names = (
    [f'{minutes} min' for minutes in elapsed_first_rev_minutes]
    + [f'{hours} hr' for hours in elapsed_first_rev_hours]
    + ['max']
)

elapsed_first_rev_median_risk = calculate_grouped(non_anon, elapsed_first_rev_time_intervals,
                                                  'elapsed_first_rev', column_names=elapsed_first_rev_column_names)
elapsed_first_rev_counts = calculate_grouped(non_anon, elapsed_first_rev_time_intervals,
                                             'elapsed_first_rev', column_names=elapsed_first_rev_column_names,
                                             grp_function='count')

display_h({
    'Median Risk': elapsed_first_rev_median_risk.style.background_gradient(cmap='viridis_r').format("{:.3f}"),
    'Number of Edits': elapsed_first_rev_counts
})
1 min | 5 min | 30 min | 1 hr | 2 hr | 4 hr | 12 hr | 24 hr | 48 hr | 72 hr | max | |
---|---|---|---|---|---|---|---|---|---|---|---|
wiki_db | |||||||||||
dewiki | 0.916 | 0.917 | 0.917 | 0.916 | 0.915 | 0.914 | 0.913 | 0.913 | 0.910 | 0.908 | 0.876 |
enwiki | 0.930 | 0.933 | 0.932 | 0.931 | 0.930 | 0.930 | 0.929 | 0.928 | 0.926 | 0.925 | 0.908 |
eswiki | 0.939 | 0.941 | 0.941 | 0.941 | 0.940 | 0.940 | 0.939 | 0.939 | 0.937 | 0.936 | 0.924 |
fawiki | 0.932 | 0.934 | 0.939 | 0.938 | 0.937 | 0.937 | 0.934 | 0.932 | 0.930 | 0.930 | 0.911 |
frwiki | 0.924 | 0.927 | 0.927 | 0.926 | 0.925 | 0.925 | 0.925 | 0.923 | 0.921 | 0.921 | 0.893 |
idwiki | 0.916 | 0.915 | 0.907 | 0.905 | 0.904 | 0.904 | 0.903 | 0.903 | 0.901 | 0.896 | 0.886 |
itwiki | 0.932 | 0.932 | 0.931 | 0.934 | 0.934 | 0.932 | 0.931 | 0.929 | 0.929 | 0.928 | 0.904 |
jawiki | 0.935 | 0.931 | 0.925 | 0.924 | 0.922 | 0.921 | 0.917 | 0.917 | 0.916 | 0.915 | 0.899 |
ptwiki | 0.925 | 0.930 | 0.932 | 0.932 | 0.932 | 0.932 | 0.931 | 0.931 | 0.930 | 0.929 | 0.913 |
ruwiki | 0.925 | 0.929 | 0.927 | 0.927 | 0.928 | 0.927 | 0.927 | 0.926 | 0.925 | 0.924 | 0.903 |
zhwiki | 0.903 | 0.920 | 0.925 | 0.926 | 0.924 | 0.922 | 0.921 | 0.921 | 0.917 | 0.912 | 0.892 |
1 min | 5 min | 30 min | 1 hr | 2 hr | 4 hr | 12 hr | 24 hr | 48 hr | 72 hr | max | |
---|---|---|---|---|---|---|---|---|---|---|---|
wiki_db | |||||||||||
dewiki | 763 | 937 | 1222 | 1300 | 1338 | 1375 | 1424 | 1460 | 1527 | 1568 | 2301 |
enwiki | 9917 | 13471 | 19062 | 20548 | 21640 | 22355 | 23261 | 24472 | 25405 | 25933 | 34986 |
eswiki | 1766 | 2432 | 3406 | 3654 | 3848 | 3974 | 4092 | 4321 | 4524 | 4624 | 5851 |
fawiki | 444 | 609 | 989 | 1125 | 1199 | 1261 | 1319 | 1469 | 1537 | 1570 | 2442 |
frwiki | 1107 | 1464 | 1981 | 2117 | 2222 | 2284 | 2367 | 2438 | 2532 | 2573 | 3611 |
idwiki | 111 | 143 | 215 | 268 | 289 | 300 | 320 | 341 | 355 | 376 | 527 |
itwiki | 570 | 770 | 1021 | 1100 | 1134 | 1175 | 1229 | 1283 | 1329 | 1341 | 1820 |
jawiki | 670 | 1181 | 1893 | 1977 | 2056 | 2101 | 2176 | 2206 | 2255 | 2279 | 2691 |
ptwiki | 1048 | 1375 | 2017 | 2214 | 2323 | 2406 | 2482 | 2536 | 2587 | 2624 | 3361 |
ruwiki | 903 | 1196 | 1716 | 1815 | 1874 | 1922 | 1965 | 2049 | 2115 | 2148 | 2833 |
zhwiki | 336 | 501 | 790 | 873 | 913 | 961 | 1001 | 1052 | 1130 | 1182 | 1587 |
time_user_prev_rev_minutes = [1, 5, 15, 30, 60, 120]
# Thresholds in seconds. The observed maximum is appended unconverted: dividing it
# by 60 and multiplying back can land just below the true maximum and drop that row.
time_user_prev_rev_time_intervals = (
    [minutes * 60 for minutes in time_user_prev_rev_minutes]
    + [non_anon.time_user_prev_rev.max()]
)
time_user_prev_rev_column_names = (
    [f'{minutes} min' for minutes in time_user_prev_rev_minutes]
    + ['max']
)

time_user_prev_rev_median_risk = calculate_grouped(non_anon, time_user_prev_rev_time_intervals,
                                                   'time_user_prev_rev', column_names=time_user_prev_rev_column_names)
time_user_prev_rev_counts = calculate_grouped(non_anon, time_user_prev_rev_time_intervals,
                                              'time_user_prev_rev', column_names=time_user_prev_rev_column_names,
                                              grp_function='count')

display_h({
    'Median Risk': time_user_prev_rev_median_risk.style.background_gradient(cmap='viridis_r').format("{:.3f}"),
    'Number of Edits': time_user_prev_rev_counts
})
1 min | 5 min | 15 min | 30 min | 60 min | 120 min | max | |
---|---|---|---|---|---|---|---|
wiki_db | |||||||
dewiki | 0.893 | 0.889 | 0.883 | 0.882 | 0.879 | 0.879 | 0.848 |
enwiki | 0.916 | 0.915 | 0.911 | 0.910 | 0.909 | 0.909 | 0.897 |
eswiki | 0.931 | 0.929 | 0.926 | 0.926 | 0.926 | 0.926 | 0.917 |
fawiki | 0.918 | 0.916 | 0.914 | 0.913 | 0.912 | 0.912 | 0.905 |
frwiki | 0.925 | 0.906 | 0.903 | 0.901 | 0.900 | 0.897 | 0.877 |
idwiki | 0.885 | 0.885 | 0.884 | 0.883 | 0.884 | 0.884 | 0.879 |
itwiki | 0.943 | 0.917 | 0.909 | 0.907 | 0.905 | 0.906 | 0.894 |
jawiki | 0.927 | 0.915 | 0.910 | 0.908 | 0.908 | 0.907 | 0.897 |
ptwiki | 0.931 | 0.925 | 0.922 | 0.921 | 0.921 | 0.921 | 0.908 |
ruwiki | 0.904 | 0.911 | 0.909 | 0.908 | 0.907 | 0.908 | 0.894 |
zhwiki | 0.897 | 0.909 | 0.906 | 0.904 | 0.903 | 0.902 | 0.890 |
1 min | 5 min | 15 min | 30 min | 60 min | 120 min | max | |
---|---|---|---|---|---|---|---|
wiki_db | |||||||
dewiki | 189 | 658 | 891 | 960 | 1003 | 1031 | 1571 |
enwiki | 4042 | 13345 | 17506 | 18766 | 19514 | 20040 | 25824 |
eswiki | 700 | 2344 | 3036 | 3242 | 3347 | 3429 | 4237 |
fawiki | 268 | 1046 | 1378 | 1470 | 1512 | 1533 | 2020 |
frwiki | 309 | 1217 | 1653 | 1785 | 1876 | 1937 | 2574 |
idwiki | 55 | 211 | 289 | 312 | 327 | 335 | 421 |
itwiki | 224 | 671 | 896 | 952 | 987 | 1010 | 1307 |
jawiki | 1045 | 1716 | 1920 | 1992 | 2013 | 2035 | 2266 |
ptwiki | 338 | 1235 | 1656 | 1774 | 1833 | 1867 | 2371 |
ruwiki | 244 | 992 | 1332 | 1423 | 1481 | 1512 | 1977 |
zhwiki | 237 | 722 | 927 | 980 | 1011 | 1031 | 1277 |
While restricting improves the score, a substantial number of edits will be eliminated for no significant benefit.
time_page_prev_rev_minutes = [1, 5, 15, 30, 60]
# Thresholds in seconds. The observed maximum is appended unconverted: dividing it
# by 60 and multiplying back can land just below the true maximum and drop that row.
time_page_prev_rev_time_intervals = (
    [minutes * 60 for minutes in time_page_prev_rev_minutes]
    + [init_criteria.time_page_prev_rev.max()]
)
time_page_prev_rev_column_names = (
    [f'{minutes} min' for minutes in time_page_prev_rev_minutes]
    + ['max']
)

time_page_prev_rev_median_risk = calculate_grouped(init_criteria, time_page_prev_rev_time_intervals,
                                                   'time_page_prev_rev', column_names=time_page_prev_rev_column_names)
time_page_prev_rev_counts = calculate_grouped(init_criteria, time_page_prev_rev_time_intervals,
                                              'time_page_prev_rev', column_names=time_page_prev_rev_column_names,
                                              grp_function='count')

display_h({
    'Median Risk': time_page_prev_rev_median_risk.style.background_gradient(cmap='viridis_r').format("{:.3f}"),
    'Number of Edits': time_page_prev_rev_counts
})
1 min | 5 min | 15 min | 30 min | 60 min | max | |
---|---|---|---|---|---|---|
wiki_db | ||||||
dewiki | 0.932 | 0.918 | 0.913 | 0.911 | 0.910 | 0.902 |
enwiki | 0.922 | 0.916 | 0.913 | 0.912 | 0.912 | 0.911 |
eswiki | 0.932 | 0.927 | 0.924 | 0.923 | 0.923 | 0.923 |
fawiki | 0.943 | 0.931 | 0.927 | 0.927 | 0.927 | 0.916 |
frwiki | 0.934 | 0.919 | 0.915 | 0.913 | 0.912 | 0.903 |
idwiki | 0.913 | 0.911 | 0.908 | 0.907 | 0.908 | 0.902 |
itwiki | 0.934 | 0.926 | 0.922 | 0.921 | 0.920 | 0.920 |
jawiki | 0.916 | 0.892 | 0.887 | 0.885 | 0.883 | 0.876 |
ptwiki | 0.937 | 0.928 | 0.924 | 0.923 | 0.921 | 0.913 |
ruwiki | 0.928 | 0.923 | 0.920 | 0.918 | 0.918 | 0.914 |
zhwiki | 0.896 | 0.888 | 0.885 | 0.885 | 0.884 | 0.883 |
1 min | 5 min | 15 min | 30 min | 60 min | max | |
---|---|---|---|---|---|---|
wiki_db | ||||||
dewiki | 1440 | 3398 | 3987 | 4192 | 4411 | 16829 |
enwiki | 20828 | 50764 | 60095 | 63694 | 66835 | 172584 |
eswiki | 6968 | 17044 | 19714 | 20687 | 21526 | 55105 |
fawiki | 1086 | 3078 | 3671 | 3860 | 4021 | 9967 |
frwiki | 1906 | 5315 | 6332 | 6647 | 6912 | 19375 |
idwiki | 465 | 1073 | 1286 | 1370 | 1442 | 3554 |
itwiki | 3027 | 6706 | 7844 | 8220 | 8561 | 23440 |
jawiki | 1625 | 3389 | 3973 | 4217 | 4450 | 10170 |
ptwiki | 340 | 967 | 1235 | 1318 | 1377 | 3361 |
ruwiki | 2323 | 6326 | 7420 | 7792 | 8089 | 23587 |
zhwiki | 1035 | 2623 | 3168 | 3388 | 3581 | 7568 |
While restricting improves the score, a substantial number of edits will be eliminated for no significant benefit.
warnings.filterwarnings('ignore')

# Lower bounds on the absolute byte change, from 0 up to the largest change observed.
bytes_diff_intervals = [0, 1, 5, 10, 100, 500, 1000, 5000, init_criteria.rev_bytes_diff.abs().max()]
bytes_diff_column_labels = ['min', *bytes_diff_intervals[1:-1], 'max']

# The same thresholds aggregated twice: median risk first, row counts second.
bytes_diff_median_risk, bytes_diff_counts = (
    calculate_grouped(
        init_criteria,
        bytes_diff_intervals,
        'rev_bytes_diff',
        column_names=bytes_diff_column_labels,
        grp_function=aggregation,
    )
    for aggregation in ('median', 'count')
)

# Wikis without any edit at a threshold are shown as 0 instead of NaN.
display_h({
    'Median Risk': bytes_diff_median_risk.fillna(0).style.background_gradient(cmap='viridis_r').format("{:.3f}"),
    'Number of Edits': bytes_diff_counts.fillna(0).astype(int),
})
min | 1 | 5 | 10 | 100 | 500 | 1000 | 5000 | max | |
---|---|---|---|---|---|---|---|---|---|
wiki_db | |||||||||
dewiki | 0.901 | 0.905 | 0.912 | 0.912 | 0.915 | 0.968 | 0.983 | 0.993 | 0.000 |
enwiki | 0.910 | 0.912 | 0.915 | 0.915 | 0.917 | 0.965 | 0.978 | 0.986 | 0.000 |
eswiki | 0.922 | 0.924 | 0.928 | 0.929 | 0.943 | 0.978 | 0.984 | 0.943 | 0.963 |
fawiki | 0.916 | 0.917 | 0.920 | 0.920 | 0.921 | 0.951 | 0.973 | 0.978 | 0.000 |
frwiki | 0.903 | 0.905 | 0.908 | 0.908 | 0.916 | 0.974 | 0.983 | 0.992 | 0.000 |
idwiki | 0.902 | 0.905 | 0.906 | 0.908 | 0.919 | 0.976 | 0.983 | 0.979 | 0.000 |
itwiki | 0.917 | 0.919 | 0.921 | 0.921 | 0.928 | 0.978 | 0.987 | 0.995 | 0.000 |
jawiki | 0.868 | 0.871 | 0.875 | 0.876 | 0.896 | 0.961 | 0.974 | 0.979 | 0.000 |
ptwiki | 0.912 | 0.914 | 0.917 | 0.916 | 0.906 | 0.919 | 0.912 | 0.914 | 0.000 |
ruwiki | 0.913 | 0.915 | 0.919 | 0.921 | 0.931 | 0.974 | 0.983 | 0.990 | 0.000 |
zhwiki | 0.883 | 0.886 | 0.890 | 0.891 | 0.915 | 0.965 | 0.976 | 0.985 | 0.000 |
min | 1 | 5 | 10 | 100 | 500 | 1000 | 5000 | max | |
---|---|---|---|---|---|---|---|---|---|
wiki_db | |||||||||
dewiki | 16711 | 15566 | 12420 | 10723 | 3840 | 1491 | 894 | 232 | 0 |
enwiki | 171191 | 159106 | 131246 | 114246 | 42246 | 15488 | 9268 | 1944 | 0 |
eswiki | 54949 | 51473 | 41913 | 35805 | 12103 | 5167 | 3242 | 183 | 1 |
fawiki | 9857 | 9387 | 8041 | 7269 | 3135 | 1046 | 592 | 86 | 0 |
frwiki | 19263 | 18155 | 15031 | 13282 | 5303 | 2303 | 1537 | 430 | 0 |
idwiki | 3526 | 3261 | 2773 | 2397 | 824 | 303 | 168 | 31 | 0 |
itwiki | 22761 | 21010 | 16844 | 14480 | 4761 | 1756 | 1064 | 295 | 0 |
jawiki | 9659 | 8968 | 7751 | 6926 | 2999 | 1357 | 898 | 224 | 0 |
ptwiki | 3339 | 3153 | 2693 | 2446 | 1038 | 402 | 217 | 46 | 0 |
ruwiki | 23264 | 21810 | 18545 | 16677 | 7075 | 2975 | 1994 | 608 | 0 |
zhwiki | 7482 | 6622 | 5646 | 4869 | 1825 | 854 | 546 | 120 | 0 |
Restricting to have at least 5 bytes difference provides a good balance between the score and the number of edits
Based on the above results, we will incrementally apply additional restrictions
# Magnitude of the byte change; referenced as `abs_bytes_diff` by the criteria strings
# of the incremental comparison.
init_criteria['abs_bytes_diff'] = init_criteria['rev_bytes_diff'].abs()
def calculate_median_risk_and_count(df, criteria, time_to_revert_limit=12*60*60):
    """Return the median risk and number of edits per wiki for filtered edits.

    Rows must satisfy ``time_to_revert <= time_to_revert_limit`` AND, when
    given, the whole ``criteria`` expression.

    Args:
        df: Frame with at least ``wiki_db``, ``risk``, ``rev_id`` and
            ``time_to_revert`` columns plus any column used in ``criteria``.
        criteria: ``DataFrame.query`` expression, or an empty string/None for
            no additional filter.
        time_to_revert_limit: Upper bound on ``time_to_revert``, in seconds.

    Returns:
        DataFrame with the columns ``wiki_db``, ``median_risk`` and ``n_edits``.
    """
    query_string = f"(time_to_revert <= {time_to_revert_limit})"
    if criteria:
        # Parenthesise both sides: in `query`, `&` and `|` bind like `and`/`or`,
        # so without the parentheses a top-level `|` inside `criteria` would
        # bypass the time-to-revert limit for its right-hand side.
        query_string += f" & ({criteria})"
    filtered_df = df.query(query_string)
    aggregated_df = filtered_df.groupby('wiki_db').agg({'risk': 'median', 'rev_id': 'count'})
    aggregated_df = aggregated_df.rename(columns={'rev_id': 'n_edits', 'risk': 'median_risk'})
    return aggregated_df.reset_index()
# Each label maps to the extra filter applied on top of the 12-hour revert window;
# 'Initial' maps to the baseline table, which is reused instead of recomputed.
criteria_conditions = {
    'Initial': init_criteria_risk,
    '+ Reverted within 12 hours': '',
    '+ User Edit Count <= 15 edits': "(is_anon == True) | (user_edit_count <= 15)",
    '+ Time Since First Edit <= 48 hrs': "(is_anon == True) | ((user_edit_count <= 15) & (elapsed_first_rev < 48*60*60))",
    '+ Absolute Bytes Diff >= 5 bytes': "(abs_bytes_diff >= 5) & ((is_anon == True) | ((user_edit_count <= 15) & (elapsed_first_rev < 48*60*60)))",
}

results = {
    label: (
        init_criteria_risk
        if label == 'Initial'
        else calculate_median_risk_and_count(init_criteria, criteria)
    )
    for label, criteria in criteria_conditions.items()
}
display_h(results)
wiki_db | median_risk | n_edits | |
---|---|---|---|
0 | dewiki | 0.901974 | 16829 |
1 | enwiki | 0.910679 | 172584 |
2 | eswiki | 0.922596 | 55105 |
3 | fawiki | 0.916366 | 9967 |
4 | frwiki | 0.903316 | 19375 |
5 | idwiki | 0.902464 | 3554 |
6 | itwiki | 0.919648 | 23440 |
7 | jawiki | 0.875682 | 10170 |
8 | ptwiki | 0.913064 | 3361 |
9 | ruwiki | 0.914291 | 23587 |
10 | zhwiki | 0.883454 | 7568 |
wiki_db | median_risk | n_edits | |
---|---|---|---|
0 | dewiki | 0.904239 | 16077 |
1 | enwiki | 0.912205 | 162439 |
2 | eswiki | 0.923474 | 52922 |
3 | fawiki | 0.916792 | 9228 |
4 | frwiki | 0.905588 | 18401 |
5 | idwiki | 0.901994 | 3231 |
6 | itwiki | 0.921301 | 22077 |
7 | jawiki | 0.879789 | 9401 |
8 | ptwiki | 0.914363 | 3147 |
9 | ruwiki | 0.916403 | 22250 |
10 | zhwiki | 0.886989 | 6880 |
wiki_db | median_risk | n_edits | |
---|---|---|---|
0 | dewiki | 0.904503 | 16061 |
1 | enwiki | 0.912847 | 160889 |
2 | eswiki | 0.923850 | 52696 |
3 | fawiki | 0.918056 | 9136 |
4 | frwiki | 0.906304 | 18285 |
5 | idwiki | 0.902892 | 3190 |
6 | itwiki | 0.921365 | 22011 |
7 | jawiki | 0.880116 | 9109 |
8 | ptwiki | 0.916916 | 3079 |
9 | ruwiki | 0.916746 | 22204 |
10 | zhwiki | 0.887588 | 6819 |
wiki_db | median_risk | n_edits | |
---|---|---|---|
0 | dewiki | 0.907555 | 15468 |
1 | enwiki | 0.915196 | 153858 |
2 | eswiki | 0.924792 | 51696 |
3 | fawiki | 0.920468 | 8539 |
4 | frwiki | 0.909034 | 17489 |
5 | idwiki | 0.905071 | 3067 |
6 | itwiki | 0.922709 | 21633 |
7 | jawiki | 0.882525 | 8828 |
8 | ptwiki | 0.930669 | 2458 |
9 | ruwiki | 0.918103 | 21661 |
10 | zhwiki | 0.890380 | 6481 |
wiki_db | median_risk | n_edits | |
---|---|---|---|
0 | dewiki | 0.917214 | 11281 |
1 | enwiki | 0.920194 | 115997 |
2 | eswiki | 0.930483 | 39239 |
3 | fawiki | 0.924352 | 6734 |
4 | frwiki | 0.913709 | 13492 |
5 | idwiki | 0.910019 | 2361 |
6 | itwiki | 0.924533 | 15505 |
7 | jawiki | 0.883670 | 6679 |
8 | ptwiki | 0.934228 | 1855 |
9 | ruwiki | 0.923788 | 16914 |
10 | zhwiki | 0.896337 | 4813 |