"""Scrape 10 km race results and a start list, then print and plot statistics.

The script performs, in order:

1. Scrape the paginated result lists (men and women) from racetimer.se.
2. Print totals, first/last finishing times and the most common first names.
3. Scrape the 2014 start list from malmomilen.se and plot the number of
   participants per start group.
4. Plot the distribution of the actual finishing times over the same groups.
5. Plot average time and number of runners per year of birth.
6. Plot the share of last names ending in "sson".

It is written to run top to bottom (for example as notebook cells); all
intermediate results are kept as module-level variables.
"""
import bisect
import datetime
import itertools
import math
import statistics as stats
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import requests
from bs4 import BeautifulSoup
from matplotlib.ticker import FuncFormatter

# `%matplotlib inline` is IPython-only syntax. Calling the magic through the
# shell object keeps the notebook behaviour while leaving the file valid
# Python; outside IPython `get_ipython` does not exist and the call is skipped.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass

plt.rcdefaults()

# racetimer.se result-class ids, keyed by gender.
raceNumber = {"M": 8364, "W": 8541}

RESULTS_URL = ("http://www.racetimer.se/en/race/resultlist/1997"
               "?checkpoint=9999&layout=marathon&page={page}&rc_id={rc_id}")
START_LIST_URL = 'http://www.malmomilen.se/anmaelan/startlista-2014'
RESULT_PAGES = 12          # result pages requested per gender
REQUEST_TIMEOUT = 30       # seconds; without it a stalled server hangs forever
HTML_PARSER = "html.parser"  # explicit parser: same tree on every machine

# Marker appended to runner names on the result page. The second form is the
# same two UTF-8 bytes (C2 BB) mis-decoded as Thai; both are stripped.
_NAME_MARKERS = ("\u00bb", "\u0e22\u0e1b")

# Finishing-time groups as inclusive (low, high) bounds in whole minutes.
intervals = [(0, 40), (41, 42), (43, 45), (46, 50), (51, 55), (56, 60), (61, 200)]
# Upper bound of every group but the last; the last group is open-ended.
_GROUP_UPPER_BOUNDS = [high for _, high in intervals[:-1]]

# Years of birth outside [MIN_YOB, RACE_YEAR) are treated as invalid.
RACE_YEAR = 2014
MIN_YOB = 1914


def _time_to_seconds(text):
    """Convert a "mm:ss" or "h:mm:ss" string to a number of seconds.

    Raises ValueError if the string has another shape or non-numeric parts.
    """
    parts = text.split(':')
    if len(parts) == 2:
        parts = ["0"] + parts
    hours, minutes, seconds = parts
    return int(hours) * 3600 + int(minutes) * 60 + int(seconds)


def _parse_year_of_birth(text):
    """Return the year of birth as an int, or -1 when it is empty or invalid."""
    try:
        return int(text.strip())
    except ValueError:
        return -1


def _parse_runner_row(cells):
    """Build a runner dict from the `td` cells of one result row.

    Cells are read positionally: rank, name, year of birth, city, bib number,
    time. Raises ValueError if rank, bib number or time are not numeric.
    """
    fullname = cells[1].text
    for marker in _NAME_MARKERS:
        fullname = fullname.replace(marker, "")
    fullname = fullname.strip()
    # Everything before the last space is the first name. rpartition yields an
    # empty first name for single-word names instead of chopping a character.
    firstname, _, lastname = fullname.rpartition(" ")
    time_text = cells[5].text.strip()
    return {
        "rank": int(cells[0].text),
        "fullname": fullname,
        "firstname": firstname.strip(),
        "lastname": lastname.strip(),
        "city": cells[3].text.strip(),
        "yob": _parse_year_of_birth(cells[2].text),
        "nb": int(cells[4].text.strip()),
        "time": time_text,
        "timeInSeconds": _time_to_seconds(time_text),
    }


def scrap_page(gender, runners, p):
    """Scrape result page `p` for `gender` and append its runners in place.

    Args:
        gender: key of `raceNumber` and of `runners` ("M" or "W").
        runners: dict mapping gender to a list; parsed rows are appended to
            `runners[gender]`.
        p: page number of the result list.

    Returns None. A page without the expected result table, or without data
    rows, adds nothing.
    """
    page = requests.get(
        RESULTS_URL.format(page=p, rc_id=raceNumber[gender]),
        timeout=REQUEST_TIMEOUT,
    )
    soup = BeautifulSoup(page.text, HTML_PARSER)
    first_table = soup.body.table if soup.body is not None else None
    if first_table is None:
        return
    table = first_table.find(id='top3-list')
    if table is None:
        return
    for line in table.find_all('tr'):
        cells = line.find_all('td')
        if not cells:
            # header rows carry no <td> cells
            continue
        runners[gender].append(_parse_runner_row(cells))


def _finish_group(seconds):
    """Return the index in `intervals` of the group a finishing time falls in.

    The time is rounded up to whole minutes. Anything slower than the last
    bounded group lands in the final, open-ended group.
    """
    minutes = int(math.ceil(seconds / 60))
    return bisect.bisect_left(_GROUP_UPPER_BOUNDS, minutes)


def _has_valid_yob(runner):
    """Tell whether the runner's year of birth lies in [MIN_YOB, RACE_YEAR)."""
    return MIN_YOB <= runner["yob"] < RACE_YEAR


def _plot_count_bars(labels, counts, title, alpha):
    """Draw a horizontal bar chart with each count written inside its bar.

    `labels[i]` is paired with `counts[i]`; the first label is drawn at the
    top. Returns the bar container produced by `plt.barh`.
    """
    positions = np.arange(len(labels))[::-1]
    bars = plt.barh(positions, counts, align='center', alpha=alpha)
    plt.yticks(positions, labels)
    plt.xlabel('Number of participants')
    plt.title(title)
    for bar, count in zip(bars, counts):
        plt.text(0.95 * bar.get_width(),
                 bar.get_y() + bar.get_height() / 2.0,
                 str(count), ha='right', va='center')
    plt.show()
    return bars


def _print_ranking(ranking):
    """Print `(name, count)` pairs as a numbered list starting at 1."""
    for position, (name, count) in enumerate(ranking, start=1):
        print(str(position) + ". " + name + ": " + str(count))


def secondsToHumanTime(x, pos):
    """Format `x` seconds as h:mm:ss; `pos` is required by FuncFormatter."""
    return str(datetime.timedelta(seconds=x))


# --- 1. Scrape the results -------------------------------------------------

runners = {
    "M": [],
    "W": []
}

for i in range(1, RESULT_PAGES + 1):
    scrap_page("M", runners, i)
    scrap_page("W", runners, i)

total_men = len(runners["M"])
total_women = len(runners["W"])

print("Men:", total_men)
print("Women:", total_women)

for g in runners.keys():
    if not runners[g]:
        # nothing scraped for this gender: there is no first/last to report
        continue
    print(g, ": first in " + runners[g][0]["time"] + " , last in " + runners[g][-1]["time"])

# --- 2. Most common first names --------------------------------------------

top = 10

men = Counter([p["firstname"] for p in runners["M"]])
top_men = men.most_common(top)
_print_ranking(top_men)

women = Counter([p["firstname"] for p in runners["W"]])
top_women = women.most_common(top)
_print_ranking(top_women)

# --- 3. Start group distribution (start list) -------------------------------

url = START_LIST_URL
page = requests.get(url, timeout=REQUEST_TIMEOUT)
soup = BeautifulSoup(page.text, HTML_PARSER)

# Count participants per start group (fifth cell of every row). A Counter
# reports 0 for a group nobody registered in instead of raising KeyError.
ps = Counter()
for line in soup.body.table.find_all('tr'):
    c = line.find_all('td')
    if len(c) < 5:
        # header or malformed row: no start-group cell to read
        continue
    ps[c[4].text if c[4].text != '' else 'N/A'] += 1

sorted_groups = ['Elitgruppen', 'Under 42 minuter', 'Under 45 minuter', '45-50 minuter',
                 '50-55 minuter', '55-60 minuter', '60+ minuter', 'Barnloppet']
# Groups are looked up by the first 8 characters of their label.
# NOTE(review): presumably the start list shows truncated labels - confirm.
groups = [g[:8] for g in sorted_groups]
n = [ps[group] for group in groups]

rects = _plot_count_bars(sorted_groups, n, 'Start group distribution', alpha=0.5)

# --- 4. Final time distribution ---------------------------------------------

all_runners = runners["M"] + runners["W"]

c = Counter(_finish_group(r["timeInSeconds"]) for r in all_runners)
# Read the counts in group order (not Counter insertion order) so that every
# bar matches its label, and empty groups still get a zero-width bar.
results = [c[i] for i in range(len(intervals))]

# remove barnloppet for this analysis
final_groups = sorted_groups[:-1]

rects = _plot_count_bars(final_groups, results, 'Final time distribution', alpha=0.6)

# --- 5. Statistics per year of birth ----------------------------------------

# remove runners without a proper year of birth
weird_yob = [r for r in all_runners if not _has_valid_yob(r)]
all_runners_yob = [r for r in all_runners if _has_valid_yob(r)]

print(str(len(weird_yob)) + " people have a weird year of birth ... Examples: " + str([r["yob"] for r in weird_yob[:15]]))

# Sort once and group once; groupby needs its input sorted by the same key,
# and the result comes out ordered by year of birth.
times_by_yob = [
    (yob, [r["timeInSeconds"] for r in group])
    for yob, group in itertools.groupby(
        sorted(all_runners_yob, key=lambda r: r["yob"]),
        key=lambda r: r["yob"],
    )
]

avg_by_yob = [(yob, stats.mean(times)) for yob, times in times_by_yob]
yob_x = [x[0] for x in avg_by_yob]
yob_y = [x[1] for x in avg_by_yob]

plt.xlabel('Year of birth')
plt.ylabel('Average time per year of birth')
plt.title('Variation of the time given the year of birth')
plt.plot(yob_x, yob_y, 'bo')
plt.gca().yaxis.set_major_formatter(FuncFormatter(secondsToHumanTime))
plt.show()

sum_by_yob = [(yob, len(times)) for yob, times in times_by_yob]
sum_x = [x[0] for x in sum_by_yob]
sum_y = [x[1] for x in sum_by_yob]

plt.xlabel('Year of birth')
plt.ylabel('Number of runners')
plt.title('Number of runners per year of birth')
plt.plot(sum_x, sum_y, 'bo', linewidth=2)
plt.show()

# --- 6. Share of last names ending in "sson" --------------------------------

total_number = len(all_runners)
total_sson = sum(1 for r in all_runners if r["lastname"].endswith("sson"))

print(str(total_sson) + " runners have a lastname finishing with \"sson\". Welcome to Sweden.")

labels = 'Lastnames finishing in "sson"', 'Others'
sizes = [total_sson, total_number - total_sson]
colors = ['gold', 'lightskyblue']
explode = (0.1, 0)
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.axis('equal')
plt.show()