%pylab inline %reset import datetime import numpy as np import pandas as pd import matplotlib as mpl import sys, os, re # Directory helpers BASE_DIR = os.getcwd() CHART_DIR = os.path.join(BASE_DIR, "charts") # Some me-specific constants SMS_FILE = "../backups/sms-20130317190417.csv" MY_NAME = "Jeremy" # Ultimately, let's consider 3am, instead of midnight, to be the end of the day. HOUR_SHIFT = 3 # Colors via http://colorbrewer2.com/ WHITE, LIGHT_GRAY, GRAY, BLACK = [ "#FFFFFF", "#E5E5E5", "#777777", "#000000" ] COLORS = [ "#FF6600", "#D7191C", "#2C7BB6", "#FFFFBF", "#ABD9E9" ] ORANGE, RED, BLUE, YELLOW, LIGHT_BLUE = COLORS HIGH_ALPHA = 0.9 MEDIUM_ALPHA = 0.5 LOW_ALPHA = 0.1 rgba = mpl.colors.colorConverter.to_rgba def set_styles(style_dict): """Set matplotlib styles from nested a nested dictionary""" for obj in style_dict: mpl.rc(obj, **style_dict[obj]) # Styles mostly derived from https://github.com/tonysyu/mpltools/blob/master/mpltools/style/ggplot.rc set_styles({ "figure": { "figsize": [ 12, 8 ], "facecolor": WHITE }, "savefig": { "dpi": 100, "bbox": "tight" }, "patch": { "linewidth": 0.5, "facecolor": ORANGE, "edgecolor": WHITE, "antialiased": True }, "font": { "size": 12 }, "legend": { "fontsize": 10 }, "axes": { "facecolor": LIGHT_GRAY, "edgecolor": WHITE, "linewidth": 1, "grid": True, "titlesize": "large", "labelsize": "large", "labelcolor": GRAY, "axisbelow": True, "color_cycle": COLORS }, "xtick": { "color": GRAY, "direction": "out" }, "ytick": { "color": GRAY, "direction": "out" }, "grid": { "color": WHITE, "linestyle": "-" } }) def savechart(filename): """Save currently open chart as a PNG to the chart directory.""" mpl.pyplot.savefig(os.path.join(CHART_DIR, filename + ".png"), bbox_inches='tight') close = mpl.pyplot.close def humancount(x): """Abbreviate a large number into something more readable.""" if x == 0: return "0" magnitude = int(np.log10(x)) thousands = magnitude / 3 divisor = pow(1000, thousands) divided = float(x)/divisor abbvs = [ "", "k", 
"m" ] return ((("%." + str(thousands) + "f") if divided % 1 > 0 else "%d") % (divided)) + abbvs[thousands] def get_step(max_val): """Given a max value, return a good step size for y-axis tick marks""" magnitude = int(np.log10(max_val)) step = pow(10, magnitude - 1) return step if max_val / step < 25 else step * 5 def pad_time_axis(axis): """Add a little extra space to the left and right of a date-based x-axis.""" old = axis.get_view_interval() axis.set_view_interval(old[0] - 0.5, old[1] + 0.5) COLS = ["contact_name", "date", "body", "type" ] msgs = pd.read_csv(os.path.join(BASE_DIR, SMS_FILE))[COLS] msgs = msgs[msgs["contact_name"] != "(Unknown)"] msgs = msgs[msgs["type"] < 3] msgs["datetime"] = msgs["date"].apply(lambda x: datetime.datetime.fromtimestamp(x / 1000)) FIRST_TIMESTAMP = msgs["datetime"].iget(0) LAST_TIMESTAMP = FIRST_TIMESTAMP + datetime.timedelta(days=365) msgs = msgs[msgs["datetime"] < LAST_TIMESTAMP] # Hour of day -- 12am as 0, 1am as 1, ..., 1pm as 13, ..., 11pm as 23 msgs["hour"] = msgs["datetime"].apply(lambda x: x.hour) # Day of week -- with Monday as 0, Tuesday as 1... 
msgs["weekday"] = msgs["datetime"].apply(lambda x: x.weekday())

# Most recent Monday
msgs["week"] = pd.DatetimeIndex(msgs["datetime"].apply(lambda x: x.date() - datetime.timedelta(days=x.weekday())))

# First day of month
msgs["month"] = pd.DatetimeIndex(msgs["datetime"].apply(lambda x: pd.Period(x, freq="M").to_timestamp()))

DATE_RANGE, WEEK_RANGE, MONTH_RANGE = (pd.period_range(FIRST_TIMESTAMP, LAST_TIMESTAMP, freq=x) for x in ("D", "W-MON", "M"))

# Type 1 is treated as received; every other remaining type as sent.
msgs["direction"] = msgs["type"].apply(lambda x: "received" if x == 1 else "sent")
msgs["chars"] = msgs["body"].apply(len).astype(int)
# Number of 160-character segments needed for the body (at least one).
# Ceiling division: exactly 160 characters is one segment, not two.
msgs["160s"] = msgs["chars"].apply(lambda x: max(1, (x + 159) // 160))

# Display names for the two metric columns (used in chart titles).
THINGS = { "chars": "Characters", "160s": "Messages" }

sent, received = (msgs[msgs["direction"] == x] for x in ("sent", "received"))
people = msgs.groupby("contact_name")
people_directions = msgs.groupby([ "contact_name", "direction" ])

# Lower-case names for the metric columns (used in printed sentences).
METRICS = { "160s": "messages", "chars": "characters" }
GROUPINGS = { "all": msgs, "sent": sent, "received": received }
# TOTALS[metric][grouping] -> column sum over that grouping.
TOTALS = dict((m, dict((key, GROUPINGS[key][m].sum()) for key in GROUPINGS)) for m in METRICS)


def print_basic_stats(per_page=2000):
    """Print headline statistics about the analyzed period.

    ``per_page`` is the assumed number of characters on a printed page, used
    only for the "pages' worth" sentence.
    """
    n_days = (LAST_TIMESTAMP - FIRST_TIMESTAMP).days

    print("Between %s and %s ...\n" % tuple(x.strftime("%B %d, %Y") for x in (FIRST_TIMESTAMP, LAST_TIMESTAMP)))
    print("I exchanged text messages with %d known contacts.\n" % len(msgs["contact_name"].unique()))

    # One paragraph per metric (messages, characters).
    print("\n\n".join("\n".join([
        "I sent %d *%s*, or %0.1f per day." % (TOTALS[m]["sent"], METRICS[m], 1.0*TOTALS[m]["sent"]/n_days),
        "I received %d, or %0.1f per day." % (TOTALS[m]["received"], 1.0*TOTALS[m]["received"]/n_days),
        "That's %d in total, or %0.1f per day." % (TOTALS[m]["all"], 1.0*TOTALS[m]["all"]/n_days),
        "For every 100 %s I sent, I received about %d." % (METRICS[m], round(100.0 * TOTALS[m]["received"] / TOTALS[m]["sent"], 0)),
    ]) for m in METRICS) + "\n")

    print("Given ~%d characters per printed page, I wrote ~%d pages' worth of text messages during this time.\n" %
          (per_page, TOTALS["chars"]["sent"] // per_page))
    print("I sent %d chars/msg. I received %d chars/msg.\n" %
          tuple(round(TOTALS["chars"][x] * 1.0 / TOTALS["160s"][x], 0) for x in ("sent", "received")))

    # Busiest calendar days for sent messages and sent characters.
    by_date = pd.DataFrame({
        "date": sent["datetime"].apply(lambda x: x.date()),
        "160s": sent["160s"],
        "chars": sent["chars"]
    }).groupby("date")
    top_msgs, top_chars = (by_date[m].sum().order(ascending=False).reset_index().values[0] for m in ("160s", "chars"))
    print("The most messages I ever sent in a day was %d, on %s.\n" % (top_msgs[1], top_msgs[0].strftime("%B %d, %Y")))
    print("The most total characters I ever sent was %d, on %s.\n" % (top_chars[1], top_chars[0].strftime("%B %d, %Y")))


print_basic_stats()


def scatter_all(contact_name=False, color_direction=False):
    """Scatter every message by (shifted) day on x and time of day on y.

    contact_name: when given, that contact's messages are highlighted and the
        others are drawn faint gray (or fully transparent when
        ``color_direction`` is set).
    color_direction: when true, sent messages are blue and received are red.
    """
    # Alter color scheme if highlighting a particular contact
    def get_color(name, direction):
        is_focus = not contact_name or contact_name == name
        if color_direction:
            if is_focus:
                return rgba((BLUE if direction == "sent" else RED), MEDIUM_ALPHA)
            return rgba(WHITE, 0)
        if is_focus:
            return rgba(ORANGE, MEDIUM_ALPHA)
        return rgba(GRAY, LOW_ALPHA)

    colors = [ get_color(name, direction) for name, direction in msgs[["contact_name", "direction"]].values ]

    # Measure vertical axis in seconds since the day "began"
    def to_seconds(x):
        shifted = x - datetime.timedelta(hours=HOUR_SHIFT)
        morn = datetime.datetime.combine(shifted, datetime.time(0))
        return (shifted - morn).seconds

    days = msgs["datetime"].apply(lambda x: (x - datetime.timedelta(hours=HOUR_SHIFT)).date())
    seconds = msgs["datetime"].apply(to_seconds)

    # Create and label plot
    close()
    plot = mpl.pyplot.scatter(
        days,
        seconds,
        c = colors,
        marker = "x",
        linewidth = 1.25 if contact_name else 1,
        norm = True,
        s = 20
    ).axes
    t1 = "Every Message" if not contact_name else "Every Message To/From %s" % contact_name
    t2 = "From %s in Blue, To %s in Red" % (MY_NAME, MY_NAME) if color_direction else None
    plot.set_title(u" • ".join(x for x in [ t1, t2 ] if x) + "\n")

    # Set x-axis details
    plot.set_xticks(MONTH_RANGE.to_datetime())
    plot.set_xticklabels(list(x.strftime("%b\n%Y") for x in MONTH_RANGE.to_datetime()))

    # Set y-axis details
    TOTAL_SECONDS = 60 * 60 * 24
    SECONDS_TO_SHOW = range(0, TOTAL_SECONDS + 1, 60 * 60)
    # Tick labels show the real clock hour, i.e. shifted back by HOUR_SHIFT.
    HOURS_TO_SHOW = list(datetime.time(((x // 3600) + HOUR_SHIFT) % 24) for x in SECONDS_TO_SHOW)
    plot.set_ylim(TOTAL_SECONDS * -0.01, TOTAL_SECONDS * 1.01)
    plot.set_yticks(SECONDS_TO_SHOW)
    plot.set_yticklabels(list(x.strftime("%I %p").lstrip("0") for x in HOURS_TO_SHOW))


scatter_all()
savechart("all-messages")
scatter_all(color_direction=True)
savechart("all-messages-directional")


def save_contact_scatters(threshold=20):
    """Save two scatter charts for every contact with more than ``threshold`` rows.

    "-AMONG" shows the contact highlighted among all messages; "-SOLO" shows
    only that contact's messages, colored by direction.
    """
    replacer = re.compile(r"[^\.a-z0-9]+")
    # Local name chosen so the module-level ``msgs`` frame is not shadowed.
    above_threshold = [ name for name, contact_msgs in people if len(contact_msgs) > threshold ]
    print("Generating charts for %d people." % len(above_threshold))
    for name in above_threshold:
        filename = re.sub(replacer, "-", name.lower())
        scatter_all(name)
        savechart("personal/%s-AMONG" % filename)
        scatter_all(name, color_direction=True)
        savechart("personal/%s-SOLO" % filename)
    close()


save_contact_scatters()


def plot_over_time(col, metric="160s", annotations=None):
    """Plot the number of messages sent + received for each week or month.

    col: grouping column in ``msgs`` ("week" or "month").
    metric: "160s" (messages) or "chars" (characters).
    annotations: optional iterable of ``(week, text, adjust_x, adjust_y)``
        tuples; each is drawn as an arrow label on the "All" line.
    """
    if annotations is None:
        annotations = []

    # Note: slicing off first and last (partial) weeks.
    all_counts, sent_counts, received_counts = (
        selection.groupby(col)[metric].sum()[1:-1] for selection in (msgs, sent, received))
    MAX_COUNT = int(max(all_counts))

    # Create and label plot
    close()
    plot = all_counts.plot(lw=2, label="All")
    sent_counts.plot(color=rgba(BLUE, 0.5), label="Sent")
    received_counts.plot(color=rgba(RED, 0.5), label="Received")
    ything = THINGS[metric]
    plot.set_title("%s per %s" % (ything, col.capitalize()) + (" (Begins on Monday)" if col == "week" else "") + "\n")
    plot.set_xlabel("")
    plot.set_ylabel(ything + " Sent + Received\n")
    leg = plot.legend(loc='upper left')
    leg.get_frame().set_alpha(0)

    # Set ticks
    step = get_step(MAX_COUNT)
    yticks = range(0, int(MAX_COUNT + 1.1) + step, step)
    plot.set_yticks(yticks)
    plot.set_yticklabels(list(humancount(y) for y in yticks))
    plot.set_ylim(0, int(MAX_COUNT * 1.1))
    pad_time_axis(plot.xaxis)

    def annotate_week (week, text, adjust_x = 0, adjust_y = 0):
        """Add an annotation to any week.

        adjust_x moves the label by that many weeks; adjust_y moves it by
        that many percent of the tallest value.
        """
        timestamp = pd.Timestamp(week)
        plot.annotate(text,
            xy=(timestamp, all_counts[timestamp]),
            xytext=(timestamp + datetime.timedelta(weeks=adjust_x), all_counts[timestamp] + adjust_y * MAX_COUNT / 100),
            horizontalalignment="center",
            arrowprops=dict(arrowstyle="-|>", color="black"))

    for a in annotations:
        annotate_week(*a)


plot_over_time("month", metric="160s")
savechart("monthly-messages")
plot_over_time("month", metric="chars")
savechart("monthly-characters")

annotations = [
    ("2012-10-29", "Hurricane Sandy", -6, 0),
    ("2012-12-24", "Home for\nthe holidays.", 0, 10),
    ("2013-01-28", "Advised\nfriend on a\n relationship.", 0, 10)
]
plot_over_time("week", metric="160s", annotations=annotations)
savechart("weekly-messages-labeled")
plot_over_time("week", metric="chars", annotations=annotations)
savechart("weekly-characters-labeled")


def plot_weekdaily(metric="160s"):
    """Plot the average number of messages sent + received per day of the week."""
    # Calculate the number of each weekday in our date range, in order to calculate the averages.
    weekday_counts = pd.Series(DATE_RANGE.to_datetime()).apply(lambda x: x.weekday()).value_counts()
    all_per_day, sent_per_day, received_per_day = (
        1.0 * selection.groupby("weekday")[metric].sum() / weekday_counts for selection in (msgs, sent, received))

    # Create and label plot
    close()
    plot = all_per_day.plot(lw=2, marker=".", markersize=10, label="All")
    sent_per_day.plot(marker=".", markersize=5, label="Sent", c=rgba(BLUE, 0.5))
    received_per_day.plot(marker=".", markersize=5, label="Received", c=rgba(RED, 0.5))
    plot.margins(0.1, 0.1)
    thing = THINGS[metric]
    plot.set_title("Average # of " + thing + " by Day of Week\n")
    plot.set_xlabel("\nDay of Week")
    plot.set_ylabel("Average # of " + thing + "\n")
    leg = plot.legend(loc='upper left')
    leg.get_frame().set_alpha(0)

    # Set x-axis details
    plot.set_xticks(range(0, len(pd.datetools.DAYS)))
    plot.set_xticklabels(pd.Series(pd.datetools.DAYS[x] for x in range(7)))

    # Set y-axis details
    MAX_COUNT = all_per_day.max()
    step = get_step(MAX_COUNT)
    # 10% headroom, matching the tick range on the next line.
    plot.set_ylim(0, MAX_COUNT * 1.1)
    plot.set_yticks(range(0, int(MAX_COUNT * 1.1) + 1, step))


plot_weekdaily()
savechart("weekdaily-messages")
plot_weekdaily("chars")
savechart("weekdaily-characters")


def plot_hourly(metric="160s", secondary_metric=False):
    """Plot each hour's share of the total for ``metric``.

    The x-axis starts at HOUR_SHIFT rather than midnight. When
    ``secondary_metric`` is given, its hourly shares are drawn as a second,
    fainter line; when it is falsy only the primary line is drawn.
    """
    def get_pcts(metric):
        hourly_counts = msgs.groupby("hour")[metric].sum()
        total = hourly_counts.sum()
        hourly_pcts = 1.0 * hourly_counts / total
        return hourly_pcts

    def fill_hours(data):
        return pd.Series(list(data[x] if x in data else 0 for x in all_hours), index = all_hours)

    # Fill in 0s for hour(s) with no SMSes
    all_hours = list(range(HOUR_SHIFT, 24)) + list(range(0, HOUR_SHIFT))
    hourly_pcts = get_pcts(metric)
    all_hourly_pcts = fill_hours(hourly_pcts)

    # Create and label plot
    close()
    plot = all_hourly_pcts.plot(lw=2, marker=".", markersize=10, label="Messages")
    if secondary_metric:
        fill_hours(get_pcts(secondary_metric)).plot(c=rgba(BLACK, 0.25), label="Characters")
    plot.set_title("Hourly Distribution\n")
    plot.set_xlabel("Hour Beginning at...")
    plot.set_ylabel("Percentage of Total Sent + Received")
    leg = plot.legend(loc='upper left')
    leg.get_frame().set_alpha(0)

    # Set x-axis details
    plot.set_xlim(min(all_hours) - 0.5, max(all_hours) + 0.5)
    xticks = plot.set_xticks(all_hours)
    for x in xticks:
        x.label.set_fontsize(10)
    plot.set_xticklabels(list(
        datetime.datetime.strptime(str((x + HOUR_SHIFT) % 24), "%H").strftime("%I%p").lstrip("0").lower().rstrip("m")
        for x in all_hours))

    # Set y-axis details
    max_pct = hourly_pcts.max()
    plot.set_ylim(-max_pct * 0.05, max_pct * 1.05)
    yticks = np.arange(0, max_pct * 1.1, 0.01)
    plot.set_yticks(yticks)
    plot.set_yticklabels(list("%d%%" % (y * 100) for y in yticks))


plot_hourly(metric="160s", secondary_metric="chars")
savechart("hourly")


def plot_monthly_uniques():
    """Plot the number of distinct contacts messaged in each month."""
    sent = msgs[msgs.direction == "sent"]
    # Drop the first and last month of the range.
    monthly_uniques = sent.groupby("month")["contact_name"].agg(lambda x: len(x.unique()))[1:-1]
    close()
    monthly_plot = monthly_uniques.plot(marker=".", markersize=10, lw=2)
    monthly_plot.set_title("Unique SMS Recipients per Month\n")
    monthly_plot.set_xlabel("")
    monthly_plot.set_ylabel("Number of Unique SMS Recipients\n")
    monthly_plot.set_ylim(0, monthly_uniques.max() * 1.1)
    monthly_plot.set_xlim(monthly_plot.get_xlim()[0] - 0.5, monthly_plot.get_xlim()[1] + 0.5)


plot_monthly_uniques()
savechart("monthly-uniques")


def plot_diversity():
    """Plot the monthly Simpson diversity index of sent-message recipients."""
    def simpson(x):
        # 1 - sum(count_i^2) / n^2 over the per-contact counts in x.
        numerator = sum(pow(y, 2) for y in x.value_counts()) * 1.0
        denominator = pow(len(x), 2)
        return 1 - numerator / denominator

    sent = msgs[msgs.direction == "sent"]
    # Drop the first and last month of the range.
    monthly_diversity = sent.groupby("month")["contact_name"].agg(simpson)[1:-1]
    close()
    monthly_plot = monthly_diversity.plot(marker=".", markersize=10, lw=2)
    monthly_plot.set_title("Simpson Diversity Index of SMS Recipients by Month\n")
    monthly_plot.set_xlabel("")
    monthly_plot.set_ylabel("Diversity (Higher = Greater Diversity of Recipients)\n")
    monthly_plot.set_ylim(0, 1)
    monthly_plot.set_xlim(monthly_plot.get_xlim()[0] - 0.5, monthly_plot.get_xlim()[1] + 0.5)
    monthly_plot.set_yticks(list(float(x) / 10 for x in range(0, 11)))


plot_diversity()
savechart("diversity")


def scatterplot_contacts(metric="160s", size=50, thing="Total Messages"):
    """Scatter each contact by amount sent (x) versus amount received (y).

    metric: column to total per contact and direction.
    size: marker size(s) forwarded to ``scatter``.
    thing: noun phrase used in the title and axis labels.
    """
    counts = people_directions[metric].sum()
    counts_unstacked = counts.unstack("direction").fillna(0)
    max_val = counts.max()
    step = get_step(max_val)
    max_val_bleed = max_val * 1.05

    close()
    plot = mpl.pyplot.scatter(counts_unstacked.sent, counts_unstacked.received, s=size, c=ORANGE, alpha=0.9, zorder=2)
    # Diagonal reference line where sent == received.
    mpl.pyplot.plot([-max_val_bleed, max_val_bleed], [-max_val_bleed, max_val_bleed], c=LIGHT_GRAY, zorder=1)
    plot.figure.set_size_inches(8, 8)

    axes = plot.axes
    axes.set_title(thing + " Sent/Received per Contact\n")
    axes.set_xlabel("\n" + thing + " Sent to Contact")
    axes.set_ylabel(thing + " Received from Contact\n")
    axes.set_xlim(-max_val_bleed / 20, max_val_bleed)
    axes.set_ylim(-max_val_bleed / 20, max_val_bleed)

    ticks = np.arange(0, max_val_bleed, step)
    labels = list(humancount(x) for x in ticks)
    axes.set_xticks(ticks)
    axes.set_yticks(ticks)
    axes.set_xticklabels(labels)
    axes.set_yticklabels(labels)


def save_sent_vs_received_scatterplots():
    """Save the sent-vs-received scatterplots for messages and for characters."""
    # All the bizarre math here is to exaggerate differences between average message lengths.
    avg_msg_len = pow(people["chars"].sum() / people["160s"].sum(), 2) / 30
    scatterplot_contacts("160s", size=avg_msg_len, thing="Total Messages")
    savechart("personal/sent-vs-received-msgs")
    scatterplot_contacts("chars", size=avg_msg_len, thing="Total Characters")
    savechart("personal/sent-vs-received-chars")
    close()


save_sent_vs_received_scatterplots()


def plot_message_hist (df, title, ything):
    """Plot a histogram of total messages per contact in ``df``.

    Counts above the 95th percentile (rounded down to a multiple of 10) are
    lumped into the last bin, which is labeled with a trailing "+".
    """
    counts = df.groupby("contact_name")["160s"].sum()
    n_people = len(msgs["contact_name"].unique())
    pct95 = (int(np.percentile(counts, 95)) // 10) * 10
    trimmed = counts.apply(lambda x: min([x, pct95]))
    step = get_step(pct95)
    bins = list(range(0, pct95 + step, step))

    # Create and label histogram
    close()
    plot = trimmed.hist(bins=bins)
    plot.set_title(title + "\n")
    plot.set_xlabel("# Of Messages")
    plot.set_ylabel("# Of %s" % (ything))

    # Set x-axis and labels
    plot.set_xticks(bins)
    plot.set_xticklabels(bins[:-1] + ["%d+" % (bins[-1])])

    # Set secondary y-axis and labels
    yticks = plot.get_yticks()
    y2 = plot.twinx()
    y2.grid(False)
    y2.set_yticks(yticks)
    y2.set_yticklabels(list("%.1f%%" % (100.0 * y / n_people) for y in yticks))
    y2.set_ylabel("Percentage of All Contacts", rotation=-90)


plot_message_hist(msgs, "Distribution of Total Messages Exchanged per Person", "People")
savechart("messages-per-person")


def plot_message_lengths(direction=False):
    """Plot a histogram of message-segment lengths in characters.

    direction: "sent" or "received" to restrict the data; falsy for all.
    """
    to_analyze = msgs[msgs["direction"] == direction] if direction else msgs

    # Break up messages longer than 160 characters into chunks of 160 characters or less
    def chunk_into_160s(x):
        if x <= 160:
            return [ x ]
        full, rest = divmod(x, 160)
        # No trailing zero-length chunk for exact multiples of 160.
        return [ 160 ] * full + ([ rest ] if rest else [])

    adjusted_chars = pd.Series(to_analyze["chars"].apply(chunk_into_160s).sum())

    # Define histogram bins and calculate max bar height
    bins = range(0, 170, 10)
    counts, divisions = np.histogram(adjusted_chars, bins=bins)
    total = len(adjusted_chars)
    max_val = counts.max() * 1.0 / total

    # Create plot and set axes
    close()
    hist = adjusted_chars.hist(bins=bins)
    hist.set_xticks(bins)
    # Ticks are placed at whole percentages of the total segment count.
    yticks = np.arange(0, max_val + 0.01, 0.01)
    hist.set_yticks(yticks * total)
    hist.set_yticklabels(list("%d%%" % (y * 100) for y in yticks))

    # Set labels
    dir_string = str(direction or "all").capitalize()
    hist.set_title(u"Message Length by Number of Characters • %s Messages\n" % dir_string)
    hist.set_xlabel("\nMessage Length (# of Characters)")
    hist.set_ylabel("Percentage of %s Messages\n" % dir_string)


plot_message_lengths()
savechart("message-lengths-all")
plot_message_lengths("sent")
savechart("message-lengths-sent")
plot_message_lengths("received")
savechart("message-lengths-received")


def plot_words(word_dict, exp=2):
    """Scatter how often each pattern occurs in sent versus received text.

    word_dict: maps a display label to a compiled regex.
    exp: root applied to the per-character frequencies so that small values
        spread out on the axes (2 means square root).
    """
    def scaler(x, rev=False):
        # Forward: x ** (1/exp). Reverse: x ** exp.
        return pow(x, pow(1.0/exp, -1 if rev else 1))

    words = pd.Series(word_dict)
    # Groups iterate in sorted key order: "received" first, then "sent".
    bodies = msgs.groupby("direction")["body"].agg(lambda x: " /// ".join(x))
    # Total characters per direction; "chars" already holds len(body) per row.
    chars_received, chars_sent = msgs.groupby("direction")["chars"].sum()
    count_received, count_sent = (words.apply(lambda regex: len(regex.findall(body))) for body in bodies)
    freq_received, freq_sent = (scaler(x) for x in (1.0 * count_received / chars_received, 1.0 * count_sent / chars_sent))
    sent_more = (freq_sent > freq_received) * 1

    word_df = pd.DataFrame({
        "regex": words,
        "freq_sent": freq_sent,
        "freq_received": freq_received,
        "sent_more": sent_more,
        "rank_received": (1 * (sent_more * (freq_received + 1))).rank(method="first"),
        "rank_sent": (1 * ((sent_more^1) * (freq_sent + 1))).rank(method="first")
    }).reset_index()

    close()
    plot = mpl.pyplot.scatter(
        word_df["freq_sent"],
        word_df["freq_received"],
        color=ORANGE,
        s=50).axes
    plot.figure.set_size_inches(10, 10)
    lim = max(max(freq_received), max(freq_sent))
    plot.set_xlim(-lim*0.01, lim*1.05)
    plot.set_ylim(-lim*0.01, lim*1.05)
    # Diagonal reference line where sent frequency == received frequency.
    mpl.pyplot.plot([ -1, 1 ], [ -1, 1 ], color=rgba(BLACK, 0.5))

    top = lim
    n = len(sent_more)
    nright = sum(sent_more)
    ntop = n - nright

    def budge(row):
        # Labels for "sent more" words go on the right edge, the rest on the
        # top edge, spaced out by rank.
        x = top if row["sent_more"] else (row["rank_sent"] - (nright + 1)) * top / ntop
        y = (row["rank_received"] - (ntop + 1)) * top / nright if row["sent_more"] else top
        return (x, y)

    def annotate(row):
        bbox_props = dict(boxstyle="round,pad=0.3", fc=WHITE, ec=rgba(BLACK, 0.25))
        plot.annotate(row["index"],
            xy=(row["freq_sent"], row["freq_received"]),
            xytext=budge(row),
            arrowprops=(dict(arrowstyle="-|>", color=rgba(BLACK, 0.5))),
            bbox=bbox_props)

    for x in word_df.index:
        row = word_df.ix[x]
        annotate(row)

    # Ticks sit at scaled 1e-2 .. 1e-5 and are labeled per 10,000 characters.
    ticks = [ 0 ] + list(scaler(1 * pow(10, -x)) for x in range(2, 6))
    labels = list(int(round(x)) if round(x, 3)%1 == 0 else x for x in list(scaler(x, rev=True) * 10000 for x in ticks))
    plot.set_xticks(ticks)
    plot.set_xticklabels(labels)
    plot.set_yticks(ticks)
    plot.set_yticklabels(labels)

    plot.set_title("Frequency of Specific Words/Characters\n")
    plot.set_xlabel("\n# Sent per 10,000 Characters")
    plot.set_ylabel("# Received per 10,000 Characters")


plot_words({
    "!": re.compile(r"\!"),
    "?": re.compile(r"\?"),
    "; (semicolon)": re.compile("; "),
    ", (comma)": re.compile(", "),
    # ". (period)": re.compile("\."),
    "oops": re.compile("oops", re.IGNORECASE),
    "yikes": re.compile("yikes", re.IGNORECASE),
    "no prob*": re.compile("no prob*", re.IGNORECASE),
    ":)": re.compile(r":.?\)"),
    ":(": re.compile(r":.?\("),
    "I": re.compile(" I(?![A-Za-z])"),
    "i": re.compile(" i(?![A-Za-z])"),
    "no/nope": re.compile("(no|nope)(?![a-z])", re.IGNORECASE),
    # Negative lookahead keeps "yes" from matching inside e.g. "yesterday".
    "yes": re.compile("yes(?![a-z])", re.IGNORECASE),
    "u": re.compile(" u(?![A-Za-z])"),
    # Negative lookahead keeps "you" from matching inside e.g. "your".
    "you": re.compile("you(?![a-z])", re.IGNORECASE),
    "(s)he": re.compile(" s?he ", re.IGNORECASE),
    # "my": re.compile(" my ", re.IGNORECASE),
    "thank*": re.compile("thanks?(?![a-z])", re.IGNORECASE),
    "please": re.compile("please", re.IGNORECASE),
    "sorry": re.compile("sorry", re.IGNORECASE),
    "congrat*": re.compile("congrat", re.IGNORECASE)
}, exp=2)
savechart("word-frequency")