Load the pickled, preprocessed data, split it into train and test sets, and train a logistic regression model.
import sys
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import FCPython
import pickle
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
import Metrica_Functions_TLMAnalytics as mfun
# Read the competition catalogue and collect the ids of the male competitions
comps = pd.read_json('open-data-master/data/competitions.json')
is_male = comps['competition_gender'] == 'male'
male_comps = comps.loc[is_male]
# unique() keeps first-appearance order, so each id is listed once
male_comps_id = list(male_comps['competition_id'].unique())
male_comps_id
[16, 43, 11, 2]
# For each selected competition, read every match-list file in its folder
# and gather the match ids they contain
male_df_list = []
for comp in male_comps_id:
    comp_dir = 'open-data-master/data/matches/' + str(comp)
    # One DataFrame per file whose name contains 'json'
    male_df_list.extend(
        pd.read_json(comp_dir + '/' + file_name)
        for file_name in os.listdir(comp_dir)
        if 'json' in file_name
    )
# Stack all match lists and keep each match id once
male_df = pd.concat(male_df_list)
male_matches = list(male_df['match_id'].unique())
# For those matches, load the events and append to a single dataframe.
# Only matches that actually have an event file on disk can be loaded, so
# keep the ids found in the events folder that are also male match ids.
# A set makes each membership test O(1) instead of scanning the list.
male_match_ids = set(male_matches)
json_files = [int(x.split('.')[0]) for x in os.listdir('open-data-master/data/events') if 'json' in x]
male_matches = [x for x in json_files if x in male_match_ids]

df_list = []
# count is the 1-based progress counter shown in the log line
for count, event_json in enumerate(male_matches, start=1):
    print('Loading: {}. {} / {}'.format(event_json, count, len(male_matches)))
    df = pd.read_json('open-data-master/data/events/' + str(event_json) + '.json')
    # Tag every event with its match id so rows stay attributable after concat
    df['match_id'] = event_json
    df_list.append(df)
# One frame holding the events of all loaded matches
df = pd.concat(df_list)
Loading: 15946. 1 / 596 Loading: 15956. 2 / 596 Loading: 15973. 3 / 596 Loading: 15978. 4 / 596 Loading: 15986. 5 / 596 Loading: 15998. 6 / 596 Loading: 16010. 7 / 596 Loading: 16023. 8 / 596 Loading: 16029. 9 / 596 Loading: 16056. 10 / 596 Loading: 16073. 11 / 596 Loading: 16079. 12 / 596 Loading: 16086. 13 / 596 Loading: 16095. 14 / 596 Loading: 16109. 15 / 596 Loading: 16120. 16 / 596 Loading: 16131. 17 / 596 Loading: 16136. 18 / 596 Loading: 16149. 19 / 596 Loading: 16157. 20 / 596 Loading: 16173. 21 / 596 Loading: 16182. 22 / 596 Loading: 16190. 23 / 596 Loading: 16196. 24 / 596 Loading: 16205. 25 / 596 Loading: 16215. 26 / 596 Loading: 16231. 27 / 596 Loading: 16240. 28 / 596 Loading: 16248. 29 / 596 Loading: 16265. 30 / 596 Loading: 16275. 31 / 596 Loading: 16289. 32 / 596 Loading: 16306. 33 / 596 Loading: 16317. 34 / 596 Loading: 18235. 35 / 596 Loading: 18236. 36 / 596 Loading: 18237. 37 / 596 Loading: 18240. 38 / 596 Loading: 18241. 39 / 596 Loading: 18242. 40 / 596 Loading: 18243. 41 / 596 Loading: 18244. 42 / 596 Loading: 18245. 43 / 596 Loading: 22912. 44 / 596 Loading: 2302764. 45 / 596 Loading: 265830. 46 / 596 Loading: 265835. 47 / 596 Loading: 265837. 48 / 596 Loading: 265839. 49 / 596 Loading: 265857. 50 / 596 Loading: 265866. 51 / 596 Loading: 265894. 52 / 596 Loading: 265896. 53 / 596 Loading: 265918. 54 / 596 Loading: 265944. 55 / 596 Loading: 265952. 56 / 596 Loading: 265958. 57 / 596 Loading: 265963. 58 / 596 Loading: 266015. 59 / 596 Loading: 266033. 60 / 596 Loading: 266045. 61 / 596 Loading: 266056. 62 / 596 Loading: 266066. 63 / 596 Loading: 266074. 64 / 596 Loading: 266106. 65 / 596 Loading: 266117. 66 / 596 Loading: 266142. 67 / 596 Loading: 266148. 68 / 596 Loading: 266149. 69 / 596 Loading: 266160. 70 / 596 Loading: 266166. 71 / 596 Loading: 266191. 72 / 596 Loading: 266201. 73 / 596 Loading: 266230. 74 / 596 Loading: 266236. 75 / 596 Loading: 266240. 76 / 596 Loading: 266254. 77 / 596 Loading: 266256. 78 / 596 Loading: 266273. 
79 / 596 Loading: 266274. 80 / 596 Loading: 266280. 81 / 596 Loading: 266299. 82 / 596 Loading: 266310. 83 / 596 Loading: 266320. 84 / 596 Loading: 266357. 85 / 596 Loading: 266406. 86 / 596 Loading: 266420. 87 / 596 Loading: 266424. 88 / 596 Loading: 266433. 89 / 596 Loading: 266440. 90 / 596 Loading: 266462. 91 / 596 Loading: 266467. 92 / 596 Loading: 266477. 93 / 596 Loading: 266490. 94 / 596 Loading: 266491. 95 / 596 Loading: 266498. 96 / 596 Loading: 266516. 97 / 596 Loading: 266525. 98 / 596 Loading: 266528. 99 / 596 Loading: 266531. 100 / 596 Loading: 266557. 101 / 596 Loading: 266560. 102 / 596 Loading: 266603. 103 / 596 Loading: 266613. 104 / 596 Loading: 266620. 105 / 596 Loading: 266631. 106 / 596 Loading: 266653. 107 / 596 Loading: 266664. 108 / 596 Loading: 266669. 109 / 596 Loading: 266670. 110 / 596 Loading: 266724. 111 / 596 Loading: 266731. 112 / 596 Loading: 266741. 113 / 596 Loading: 266770. 114 / 596 Loading: 266794. 115 / 596 Loading: 266815. 116 / 596 Loading: 266827. 117 / 596 Loading: 266838. 118 / 596 Loading: 266846. 119 / 596 Loading: 266871. 120 / 596 Loading: 266874. 121 / 596 Loading: 266883. 122 / 596 Loading: 266885. 123 / 596 Loading: 266892. 124 / 596 Loading: 266916. 125 / 596 Loading: 266921. 126 / 596 Loading: 266929. 127 / 596 Loading: 266952. 128 / 596 Loading: 266961. 129 / 596 Loading: 266967. 130 / 596 Loading: 266986. 131 / 596 Loading: 266989. 132 / 596 Loading: 267039. 133 / 596 Loading: 267058. 134 / 596 Loading: 267076. 135 / 596 Loading: 267077. 136 / 596 Loading: 267085. 137 / 596 Loading: 267101. 138 / 596 Loading: 267138. 139 / 596 Loading: 267183. 140 / 596 Loading: 267192. 141 / 596 Loading: 267197. 142 / 596 Loading: 267212. 143 / 596 Loading: 267220. 144 / 596 Loading: 267273. 145 / 596 Loading: 267274. 146 / 596 Loading: 267301. 147 / 596 Loading: 267327. 148 / 596 Loading: 267343. 149 / 596 Loading: 267368. 150 / 596 Loading: 267373. 151 / 596 Loading: 267395. 152 / 596 Loading: 267400. 
153 / 596 Loading: 267422. 154 / 596 Loading: 267432. 155 / 596 Loading: 267464. 156 / 596 Loading: 267492. 157 / 596 Loading: 267499. 158 / 596 Loading: 267502. 159 / 596 Loading: 267506. 160 / 596 Loading: 267520. 161 / 596 Loading: 267533. 162 / 596 Loading: 267561. 163 / 596 Loading: 267567. 164 / 596 Loading: 267569. 165 / 596 Loading: 267576. 166 / 596 Loading: 267590. 167 / 596 Loading: 267596. 168 / 596 Loading: 267597. 169 / 596 Loading: 267611. 170 / 596 Loading: 267660. 171 / 596 Loading: 267670. 172 / 596 Loading: 267675. 173 / 596 Loading: 303377. 174 / 596 Loading: 303400. 175 / 596 Loading: 303421. 176 / 596 Loading: 303430. 177 / 596 Loading: 303451. 178 / 596 Loading: 303470. 179 / 596 Loading: 303473. 180 / 596 Loading: 303479. 181 / 596 Loading: 303487. 182 / 596 Loading: 303493. 183 / 596 Loading: 303504. 184 / 596 Loading: 303516. 185 / 596 Loading: 303517. 186 / 596 Loading: 303524. 187 / 596 Loading: 303532. 188 / 596 Loading: 303548. 189 / 596 Loading: 303596. 190 / 596 Loading: 303600. 191 / 596 Loading: 303610. 192 / 596 Loading: 303615. 193 / 596 Loading: 303634. 194 / 596 Loading: 303652. 195 / 596 Loading: 303664. 196 / 596 Loading: 303666. 197 / 596 Loading: 303674. 198 / 596 Loading: 303680. 199 / 596 Loading: 303682. 200 / 596 Loading: 303696. 201 / 596 Loading: 303700. 202 / 596 Loading: 303707. 203 / 596 Loading: 303715. 204 / 596 Loading: 303725. 205 / 596 Loading: 303731. 206 / 596 Loading: 3749052. 207 / 596 Loading: 3749068. 208 / 596 Loading: 3749079. 209 / 596 Loading: 3749133. 210 / 596 Loading: 3749153. 211 / 596 Loading: 3749192. 212 / 596 Loading: 3749196. 213 / 596 Loading: 3749233. 214 / 596 Loading: 3749246. 215 / 596 Loading: 3749253. 216 / 596 Loading: 3749257. 217 / 596 Loading: 3749276. 218 / 596 Loading: 3749278. 219 / 596 Loading: 3749296. 220 / 596 Loading: 3749310. 221 / 596 Loading: 3749346. 222 / 596 Loading: 3749358. 223 / 596 Loading: 3749360. 224 / 596 Loading: 3749403. 225 / 596 Loading: 3749431. 
226 / 596 Loading: 3749434. 227 / 596 Loading: 3749448. 228 / 596 Loading: 3749453. 229 / 596 Loading: 3749454. 230 / 596 Loading: 3749462. 231 / 596 Loading: 3749465. 232 / 596 Loading: 3749493. 233 / 596 Loading: 3749522. 234 / 596 Loading: 3749526. 235 / 596 Loading: 3749528. 236 / 596 Loading: 3749552. 237 / 596 Loading: 3749603. 238 / 596 Loading: 3749642. 239 / 596 Loading: 3750200. 240 / 596 Loading: 3750201. 241 / 596 Loading: 3752619. 242 / 596 Loading: 68313. 243 / 596 Loading: 68314. 244 / 596 Loading: 68315. 245 / 596 Loading: 68316. 246 / 596 Loading: 68317. 247 / 596 Loading: 68318. 248 / 596 Loading: 68319. 249 / 596 Loading: 68320. 250 / 596 Loading: 68321. 251 / 596 Loading: 68322. 252 / 596 Loading: 68323. 253 / 596 Loading: 68324. 254 / 596 Loading: 68325. 255 / 596 Loading: 68326. 256 / 596 Loading: 68327. 257 / 596 Loading: 68328. 258 / 596 Loading: 68329. 259 / 596 Loading: 68330. 260 / 596 Loading: 68331. 261 / 596 Loading: 68332. 262 / 596 Loading: 68333. 263 / 596 Loading: 68334. 264 / 596 Loading: 68335. 265 / 596 Loading: 68336. 266 / 596 Loading: 68339. 267 / 596 Loading: 68340. 268 / 596 Loading: 68341. 269 / 596 Loading: 68342. 270 / 596 Loading: 68347. 271 / 596 Loading: 68348. 272 / 596 Loading: 68350. 273 / 596 Loading: 68351. 274 / 596 Loading: 68352. 275 / 596 Loading: 68353. 276 / 596 Loading: 68354. 277 / 596 Loading: 68356. 278 / 596 Loading: 68358. 279 / 596 Loading: 68359. 280 / 596 Loading: 68360. 281 / 596 Loading: 68361. 282 / 596 Loading: 68363. 283 / 596 Loading: 68364. 284 / 596 Loading: 68365. 285 / 596 Loading: 68366. 286 / 596 Loading: 69138. 287 / 596 Loading: 69139. 288 / 596 Loading: 69141. 289 / 596 Loading: 69142. 290 / 596 Loading: 69143. 291 / 596 Loading: 69144. 292 / 596 Loading: 69145. 293 / 596 Loading: 69146. 294 / 596 Loading: 69147. 295 / 596 Loading: 69148. 296 / 596 Loading: 69149. 297 / 596 Loading: 69151. 298 / 596 Loading: 69153. 299 / 596 Loading: 69154. 300 / 596 Loading: 69155. 
301 / 596 Loading: 69156. 302 / 596 Loading: 69157. 303 / 596 Loading: 69158. 304 / 596 Loading: 69159. 305 / 596 Loading: 69160. 306 / 596 Loading: 69162. 307 / 596 Loading: 69164. 308 / 596 Loading: 69165. 309 / 596 Loading: 69166. 310 / 596 Loading: 69169. 311 / 596 Loading: 69170. 312 / 596 Loading: 69171. 313 / 596 Loading: 69172. 314 / 596 Loading: 69173. 315 / 596 Loading: 69174. 316 / 596 Loading: 69175. 317 / 596 Loading: 69176. 318 / 596 Loading: 69177. 319 / 596 Loading: 69178. 320 / 596 Loading: 69179. 321 / 596 Loading: 69180. 322 / 596 Loading: 69181. 323 / 596 Loading: 69182. 324 / 596 Loading: 69183. 325 / 596 Loading: 69184. 326 / 596 Loading: 69185. 327 / 596 Loading: 69186. 328 / 596 Loading: 69187. 329 / 596 Loading: 69189. 330 / 596 Loading: 69195. 331 / 596 Loading: 69207. 332 / 596 Loading: 69209. 333 / 596 Loading: 69210. 334 / 596 Loading: 69211. 335 / 596 Loading: 69212. 336 / 596 Loading: 69213. 337 / 596 Loading: 69214. 338 / 596 Loading: 69215. 339 / 596 Loading: 69216. 340 / 596 Loading: 69217. 341 / 596 Loading: 69218. 342 / 596 Loading: 69219. 343 / 596 Loading: 69220. 344 / 596 Loading: 69221. 345 / 596 Loading: 69222. 346 / 596 Loading: 69223. 347 / 596 Loading: 69224. 348 / 596 Loading: 69225. 349 / 596 Loading: 69226. 350 / 596 Loading: 69227. 351 / 596 Loading: 69228. 352 / 596 Loading: 69229. 353 / 596 Loading: 69230. 354 / 596 Loading: 69231. 355 / 596 Loading: 69232. 356 / 596 Loading: 69233. 357 / 596 Loading: 69234. 358 / 596 Loading: 69235. 359 / 596 Loading: 69236. 360 / 596 Loading: 69237. 361 / 596 Loading: 69238. 362 / 596 Loading: 69239. 363 / 596 Loading: 69240. 364 / 596 Loading: 69241. 365 / 596 Loading: 69242. 366 / 596 Loading: 69243. 367 / 596 Loading: 69244. 368 / 596 Loading: 69245. 369 / 596 Loading: 69246. 370 / 596 Loading: 69247. 371 / 596 Loading: 69248. 372 / 596 Loading: 69249. 373 / 596 Loading: 69250. 374 / 596 Loading: 69251. 375 / 596 Loading: 69252. 376 / 596 Loading: 69253. 
377 / 596 Loading: 69254. 378 / 596 Loading: 69255. 379 / 596 Loading: 69256. 380 / 596 Loading: 69257. 381 / 596 Loading: 69259. 382 / 596 Loading: 69260. 383 / 596 Loading: 69262. 384 / 596 Loading: 69263. 385 / 596 Loading: 69264. 386 / 596 Loading: 69265. 387 / 596 Loading: 69267. 388 / 596 Loading: 69268. 389 / 596 Loading: 69269. 390 / 596 Loading: 69270. 391 / 596 Loading: 69271. 392 / 596 Loading: 69272. 393 / 596 Loading: 69273. 394 / 596 Loading: 69274. 395 / 596 Loading: 69275. 396 / 596 Loading: 69276. 397 / 596 Loading: 69277. 398 / 596 Loading: 69278. 399 / 596 Loading: 69279. 400 / 596 Loading: 69280. 401 / 596 Loading: 69282. 402 / 596 Loading: 69283. 403 / 596 Loading: 69285. 404 / 596 Loading: 69286. 405 / 596 Loading: 69287. 406 / 596 Loading: 69288. 407 / 596 Loading: 69289. 408 / 596 Loading: 69291. 409 / 596 Loading: 69292. 410 / 596 Loading: 69293. 411 / 596 Loading: 69295. 412 / 596 Loading: 69296. 413 / 596 Loading: 69297. 414 / 596 Loading: 69298. 415 / 596 Loading: 69299. 416 / 596 Loading: 69300. 417 / 596 Loading: 69302. 418 / 596 Loading: 69303. 419 / 596 Loading: 69304. 420 / 596 Loading: 69305. 421 / 596 Loading: 69306. 422 / 596 Loading: 69307. 423 / 596 Loading: 69308. 424 / 596 Loading: 69312. 425 / 596 Loading: 69314. 426 / 596 Loading: 69315. 427 / 596 Loading: 69316. 428 / 596 Loading: 69318. 429 / 596 Loading: 69319. 430 / 596 Loading: 69320. 431 / 596 Loading: 69322. 432 / 596 Loading: 69323. 433 / 596 Loading: 69324. 434 / 596 Loading: 69325. 435 / 596 Loading: 69326. 436 / 596 Loading: 69327. 437 / 596 Loading: 69328. 438 / 596 Loading: 69329. 439 / 596 Loading: 69330. 440 / 596 Loading: 69331. 441 / 596 Loading: 69332. 442 / 596 Loading: 69333. 443 / 596 Loading: 69334. 444 / 596 Loading: 69335. 445 / 596 Loading: 69336. 446 / 596 Loading: 69337. 447 / 596 Loading: 69338. 448 / 596 Loading: 69340. 449 / 596 Loading: 69343. 450 / 596 Loading: 70219. 451 / 596 Loading: 70220. 452 / 596 Loading: 70221. 
453 / 596 Loading: 70223. 454 / 596 Loading: 70224. 455 / 596 Loading: 70225. 456 / 596 Loading: 70256. 457 / 596 Loading: 70259. 458 / 596 Loading: 70260. 459 / 596 Loading: 70262. 460 / 596 Loading: 70263. 461 / 596 Loading: 70264. 462 / 596 Loading: 70270. 463 / 596 Loading: 70271. 464 / 596 Loading: 70272. 465 / 596 Loading: 70273. 466 / 596 Loading: 70275. 467 / 596 Loading: 70276. 468 / 596 Loading: 70277. 469 / 596 Loading: 70280. 470 / 596 Loading: 70281. 471 / 596 Loading: 70282. 472 / 596 Loading: 70283. 473 / 596 Loading: 70284. 474 / 596 Loading: 70286. 475 / 596 Loading: 70287. 476 / 596 Loading: 70288. 477 / 596 Loading: 70289. 478 / 596 Loading: 70291. 479 / 596 Loading: 70292. 480 / 596 Loading: 70293. 481 / 596 Loading: 70294. 482 / 596 Loading: 70295. 483 / 596 Loading: 70296. 484 / 596 Loading: 70297. 485 / 596 Loading: 70298. 486 / 596 Loading: 70300. 487 / 596 Loading: 70301. 488 / 596 Loading: 70302. 489 / 596 Loading: 70303. 490 / 596 Loading: 70304. 491 / 596 Loading: 70305. 492 / 596 Loading: 70306. 493 / 596 Loading: 70307. 494 / 596 Loading: 70308. 495 / 596 Loading: 70309. 496 / 596 Loading: 7525. 497 / 596 Loading: 7529. 498 / 596 Loading: 7530. 499 / 596 Loading: 7531. 500 / 596 Loading: 7532. 501 / 596 Loading: 7533. 502 / 596 Loading: 7534. 503 / 596 Loading: 7535. 504 / 596 Loading: 7536. 505 / 596 Loading: 7537. 506 / 596 Loading: 7538. 507 / 596 Loading: 7539. 508 / 596 Loading: 7540. 509 / 596 Loading: 7541. 510 / 596 Loading: 7542. 511 / 596 Loading: 7543. 512 / 596 Loading: 7544. 513 / 596 Loading: 7545. 514 / 596 Loading: 7546. 515 / 596 Loading: 7547. 516 / 596 Loading: 7548. 517 / 596 Loading: 7549. 518 / 596 Loading: 7550. 519 / 596 Loading: 7551. 520 / 596 Loading: 7552. 521 / 596 Loading: 7553. 522 / 596 Loading: 7554. 523 / 596 Loading: 7555. 524 / 596 Loading: 7556. 525 / 596 Loading: 7557. 526 / 596 Loading: 7558. 527 / 596 Loading: 7559. 528 / 596 Loading: 7560. 529 / 596 Loading: 7561. 530 / 596 Loading: 7562. 
531 / 596 Loading: 7563. 532 / 596 Loading: 7564. 533 / 596 Loading: 7565. 534 / 596 Loading: 7566. 535 / 596 Loading: 7567. 536 / 596 Loading: 7568. 537 / 596 Loading: 7569. 538 / 596 Loading: 7570. 539 / 596 Loading: 7571. 540 / 596 Loading: 7572. 541 / 596 Loading: 7576. 542 / 596 Loading: 7577. 543 / 596 Loading: 7578. 544 / 596 Loading: 7579. 545 / 596 Loading: 7580. 546 / 596 Loading: 7581. 547 / 596 Loading: 7582. 548 / 596 Loading: 7583. 549 / 596 Loading: 7584. 550 / 596 Loading: 7585. 551 / 596 Loading: 7586. 552 / 596 Loading: 8649. 553 / 596 Loading: 8650. 554 / 596 Loading: 8651. 555 / 596 Loading: 8652. 556 / 596 Loading: 8655. 557 / 596 Loading: 8656. 558 / 596 Loading: 8657. 559 / 596 Loading: 8658. 560 / 596 Loading: 9575. 561 / 596 Loading: 9581. 562 / 596 Loading: 9592. 563 / 596 Loading: 9602. 564 / 596 Loading: 9609. 565 / 596 Loading: 9620. 566 / 596 Loading: 9636. 567 / 596 Loading: 9642. 568 / 596 Loading: 9650. 569 / 596 Loading: 9661. 570 / 596 Loading: 9673. 571 / 596 Loading: 9682. 572 / 596 Loading: 9695. 573 / 596 Loading: 9700. 574 / 596 Loading: 9717. 575 / 596 Loading: 9726. 576 / 596 Loading: 9736. 577 / 596 Loading: 9742. 578 / 596 Loading: 9754. 579 / 596 Loading: 9765. 580 / 596 Loading: 9774. 581 / 596 Loading: 9783. 582 / 596 Loading: 9794. 583 / 596 Loading: 9799. 584 / 596 Loading: 9811. 585 / 596 Loading: 9827. 586 / 596 Loading: 9837. 587 / 596 Loading: 9855. 588 / 596 Loading: 9860. 589 / 596 Loading: 9870. 590 / 596 Loading: 9880. 591 / 596 Loading: 9889. 592 / 596 Loading: 9912. 593 / 596 Loading: 9924. 594 / 596 Loading: 9928. 595 / 596 Loading: 9948. 596 / 596
# Keep only the events that carry shot details and renumber them from zero
has_shot = df['shot'].notnull()
shots = df[has_shot].reset_index(drop=True)
shots.head(3)
id | index | period | timestamp | minute | second | type | possession | possession_team | play_pattern | ... | match_id | clearance | off_camera | miscontrol | 50_50 | out | injury_stoppage | half_start | player_off | half_end | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 65f16e50-7c5d-4293-b2fc-d20887a772f9 | 148 | 1 | 2021-01-03 00:02:29.094 | 2 | 29 | {'id': 16, 'name': 'Shot'} | 6 | {'id': 217, 'name': 'Barcelona'} | {'id': 1, 'name': 'Regular Play'} | ... | 15946 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | b0f73423-3990-45ae-9dda-3512c2d1aff3 | 283 | 1 | 2021-01-03 00:05:39.239 | 5 | 39 | {'id': 16, 'name': 'Shot'} | 11 | {'id': 217, 'name': 'Barcelona'} | {'id': 1, 'name': 'Regular Play'} | ... | 15946 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 13b1ddab-d22e-43d9-bfe4-12632fea1a27 | 755 | 1 | 2021-01-03 00:15:28.625 | 15 | 28 | {'id': 16, 'name': 'Shot'} | 26 | {'id': 217, 'name': 'Barcelona'} | {'id': 8, 'name': 'From Keeper'} | ... | 15946 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 rows × 43 columns
# Goal-post coordinates; the goal centre used for 'c' and 'distance' is (120, 40)
left_post_x, left_post_y = (120, 36)
right_post_x, right_post_y = (120, 44)
# Post-to-post distance is the same for every shot, so compute it once
goal_width = np.sqrt((left_post_x - right_post_x)**2 + (left_post_y - right_post_y)**2)

# Build one feature row per kept shot. Rows are written one at a time with
# .loc so the frame keeps the original shot index and column order.
# NOTE(review): building from a list of records would be faster, but may change
# the column dtypes the model fit further down relies on - confirm before changing.
shots_model = pd.DataFrame(columns=['goal', 'x', 'y'])
for index, shot in shots.iterrows():
    shot_details = shot['shot']

    # Keep only open play, non-headed shots
    if shot_details['type']['name'] != 'Open Play':
        continue
    if shot_details['body_part']['name'] == 'Head':
        continue

    # Goal (1) or no goal (0)
    shots_model.loc[index, 'goal'] = 1 if shot_details['outcome']['name'] == 'Goal' else 0

    # X, Y locations
    shot_location_x, shot_location_y = shot['location']
    shots_model.loc[index, 'x'] = shot_location_x
    shots_model.loc[index, 'y'] = shot_location_y
    # Absolute offset from the y coordinate of the goal centre
    shots_model.loc[index, 'c'] = abs(shot_location_y - 40)

    # Distance to centre of goal
    shots_model.loc[index, 'distance'] = np.sqrt((120 - shot_location_x)**2 + (40 - shot_location_y)**2)

    # Angle to goal (radians): law of cosines on the triangle formed by the
    # shot location and the two posts
    dist_to_right_post = np.sqrt((shot_location_x - right_post_x)**2 + (shot_location_y - right_post_y)**2)
    dist_to_left_post = np.sqrt((left_post_x - shot_location_x)**2 + (left_post_y - shot_location_y)**2)
    cos_angle = ((dist_to_right_post**2 + dist_to_left_post**2 - goal_width**2)
                 / (2 * dist_to_right_post * dist_to_left_post))
    # Floating-point rounding can push the cosine marginally outside [-1, 1]
    # (e.g. a shot in line with both posts); arccos then returns NaN and emits
    # an "invalid value" RuntimeWarning. Clamp first. arccos is never negative,
    # so no sign correction is needed afterwards. A shot taken exactly from a
    # post still yields NaN (0/0), as before.
    shots_model.loc[index, 'angle'] = np.arccos(np.clip(cos_angle, -1.0, 1.0))

    # Play pattern
    shots_model.loc[index, 'play_pattern'] = shot['play_pattern']['name']

    # Body part
    shots_model.loc[index, 'body_part'] = shot_details['body_part']['name']

    # First time: flagged by the mere presence of the key
    shots_model.loc[index, 'first_time'] = 1 if 'first_time' in shot_details else 0

    # Technique
    shots_model.loc[index, 'technique'] = shot_details['technique']['name']

    # Pressure. Compared with == True rather than by truthiness: a missing flag
    # is presumably NaN, which is truthy - TODO confirm.
    shots_model.loc[index, 'under_pressure'] = 1 if shot['under_pressure'] == True else 0

    # Freeze Frame - defender location
    # Number of defenders between shot location and goal
    # Distance to nearest defender between shot location and goal
    # Distance to nearest defender
    # TBD: position of nearest defender
    # TBD: angle of goal left after removing blocked defenders (assume ~1m width)
    # A shot without a freeze frame is treated as having no defenders in view.
    freeze_frame_def = [player for player in (shot_details.get('freeze_frame') or [])
                        if not player['teammate']]

    defender_distances = []
    blocking_distances = []
    for defender in freeze_frame_def:
        defender_x, defender_y = defender['location']
        distance_defender = np.sqrt((shot_location_x - defender_x)**2 + (shot_location_y - defender_y)**2)
        defender_distances.append(distance_defender)

        # Is the defender inside the triangle shot location / left post / right post?
        blocking = mfun.is_inside(shot_location_x, shot_location_y
                                  , left_post_x, left_post_y
                                  , right_post_x, right_post_y
                                  , defender_x, defender_y)
        # Explicit comparison kept: is_inside's return type is not visible here
        if blocking == True:
            # Defender is blocking part of the goal; same distance as above
            blocking_distances.append(distance_defender)

    # None when there is no (blocking) defender in the freeze frame
    shots_model.loc[index, 'distance_nearest_defender'] = min(defender_distances, default=None)
    shots_model.loc[index, 'distance_nearest_blocking_defender'] = min(blocking_distances, default=None)
    shots_model.loc[index, 'number_blocking_defenders'] = len(blocking_distances)

    # Key Pass info
    # TBD - get info from previous pass
    # Eg. cross / through ball / where it was etc

    # StatsBomb xG
    shots_model.loc[index, 'statsbomb_xg'] = shot_details['statsbomb_xg']
# # Make locations numeric
# shots_model['x'] = pd.to_numeric(shots_model['x'])
# shots_model['y'] = pd.to_numeric(shots_model['y'])
# # Try squared distances
# shots_model['d2'] = shots_model['distance']**2
# shots_model['x2'] = shots_model['x']**2
# shots_model['c2'] = shots_model['c']**2
# # Try angle * x location
# shots_model['ax'] = shots_model['angle']*shots_model['x']
C:\Users\Ciaran\AppData\Roaming\Python\Python36\site-packages\ipykernel_launcher.py:36: RuntimeWarning: invalid value encountered in arccos
# Persist the engineered shot features, then read them back from disk
shots_model_path = "./shots_model.pkl"
shots_model.to_pickle(shots_model_path)
# Load shot model
shots_model = pd.read_pickle(shots_model_path)
shots_model.head()
goal | x | y | c | distance | angle | play_pattern | body_part | first_time | technique | under_pressure | distance_nearest_defender | distance_nearest_blocking_defender | number_blocking_defenders | statsbomb_xg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 111.7 | 51.7 | 11.7 | 14.345034 | 0.336567 | Regular Play | Right Foot | 1.0 | Half Volley | 0.0 | 1.303840 | NaN | 0.0 | 0.075164 |
1 | 0 | 114 | 27 | 13.0 | 14.317821 | 0.248710 | Regular Play | Left Foot | 1.0 | Volley | 0.0 | 3.700000 | NaN | 0.0 | 0.062892 |
2 | 0 | 92 | 34.5 | 5.5 | 28.535066 | 0.273578 | From Keeper | Left Foot | 0.0 | Normal | 0.0 | 2.884441 | 5.124451 | 1.0 | 0.020535 |
4 | 0 | 107 | 25 | 15.0 | 19.849433 | 0.268489 | From Corner | Right Foot | 0.0 | Normal | 0.0 | 3.244996 | NaN | 0.0 | 0.035420 |
5 | 0 | 108.1 | 27.4 | 12.6 | 17.331186 | 0.323048 | Regular Play | Left Foot | 1.0 | Half Volley | 1.0 | 2.039608 | NaN | 0.0 | 0.089920 |
# Separate into train, test data for modelling: 20% of the shots are held out,
# and random_state is fixed at 42
predictors = shots_model.drop(columns='goal')
target = shots_model[['goal']]
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.2, random_state=42)
# Put the target back as the first column of each split
train = pd.concat([y_train, X_train], axis=1)
test = pd.concat([y_test, X_test], axis=1)
# A GLM for fitting goal probability
model_variables = [
    'distance',
    'angle',
    'distance_nearest_defender',
    'number_blocking_defenders',
]
# Right-hand side of the formula: the predictor names joined with ' + '
model = ' + '.join(model_variables)
# Fit the model with a binomial family on the training split
# NOTE(review): the printed summary lists the dependent variable as
# ['goal[0]', 'goal[1]'], which suggests 'goal' is treated as categorical -
# confirm which class the fitted coefficients describe.
xG_model = smf.glm(
    formula="goal ~ " + model,
    data=train,
    family=sm.families.Binomial(),
).fit()
print(xG_model.summary())
# Coefficients are kept separately for the manual xG calculation
xG_model_params = xG_model.params
Generalized Linear Model Regression Results ================================================================================== Dep. Variable: ['goal[0]', 'goal[1]'] No. Observations: 9295 Model: GLM Df Residuals: 9290 Model Family: Binomial Df Model: 4 Link Function: logit Scale: 1.0000 Method: IRLS Log-Likelihood: -2901.7 Date: Sun, 03 Jan 2021 Deviance: 5803.5 Time: 11:31:04 Pearson chi2: 9.45e+03 No. Iterations: 6 Covariance Type: nonrobust ============================================================================================= coef std err z P>|z| [0.025 0.975] --------------------------------------------------------------------------------------------- Intercept 1.0519 0.205 5.122 0.000 0.649 1.454 distance 0.1080 0.008 12.989 0.000 0.092 0.124 angle -1.6109 0.181 -8.917 0.000 -1.965 -1.257 distance_nearest_defender -0.1242 0.021 -5.858 0.000 -0.166 -0.083 number_blocking_defenders 0.3260 0.054 6.053 0.000 0.220 0.432 =============================================================================================
# Calculate xG for GLM using each shot as input (row of shots_model)
def calculate_xG(sh):
    """Return the expected-goal value of one shot from the fitted coefficients.

    Args:
        sh: mapping-like shot (a DataFrame row or a dict) that can be indexed
            by every name in ``model_variables``.

    Returns:
        ``1 / (1 + exp(b))``, where ``b`` is the intercept plus the sum of
        coefficient * value over ``model_variables``.

    Reads the module-level ``xG_model_params`` and ``model_variables``. The
    coefficients are taken by position: entry 0 is the intercept and entry
    ``i + 1`` belongs to ``model_variables[i]``.
    """
    # .iloc makes the positional lookup explicit; plain [] with an integer on
    # a label-indexed Series is deprecated in recent pandas.
    linear_predictor = xG_model_params.iloc[0]
    for position, variable in enumerate(model_variables, start=1):
        # linear predictor = intercept + sum(coefficient * variable value)
        linear_predictor = linear_predictor + xG_model_params.iloc[position] * sh[variable]
    # Note the positive exponent: this equals 1 - 1 / (1 + exp(-b)), i.e. b is
    # used as the log-odds of the shot NOT being a goal. That is consistent
    # with the model summary in this notebook, which lists 'goal[0]' first in
    # the dependent variable - re-check the sign if the fit changes.
    xG = 1 / (1 + np.exp(linear_predictor))
    return xG
# Score every shot in both splits with the fitted coefficients, then store the
# results as an 'xG' column on each split
train_xG = train.apply(calculate_xG, axis=1)
test_xG = test.apply(calculate_xG, axis=1)
train['xG'] = train_xG
test['xG'] = test_xG
test.tail()
goal | x | y | c | distance | angle | play_pattern | body_part | first_time | technique | under_pressure | distance_nearest_defender | distance_nearest_blocking_defender | number_blocking_defenders | statsbomb_xg | xG | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3580 | 0 | 102.5 | 23.8 | 16.2 | 23.847222 | 0.248088 | From Corner | Right Foot | 0.0 | Normal | 0.0 | 3.894868 | NaN | 0.0 | 0.025732 | 0.060465 |
13598 | 0 | 115 | 26 | 14.0 | 14.866069 | 0.192701 | Regular Play | Left Foot | 1.0 | Normal | 0.0 | 5.000000 | 10.440307 | 1.0 | 0.052736 | 0.113889 |
4691 | 0 | 102.1 | 36.3 | 3.7 | 18.278403 | 0.422998 | Regular Play | Left Foot | 0.0 | Half Volley | 0.0 | 1.664332 | NaN | 0.0 | 0.047882 | 0.105522 |
8187 | 1 | 112.5 | 38.1 | 1.9 | 7.736924 | 0.939567 | From Goal Kick | Right Foot | 0.0 | Normal | 0.0 | 1.204159 | NaN | 0.0 | 0.354846 | 0.444201 |
675 | 0 | 117.4 | 30.7 | 9.3 | 9.656604 | 0.263018 | Regular Play | Left Foot | 0.0 | Normal | 0.0 | 3.605551 | NaN | 0.0 | 0.520617 | 0.227407 |
# Goal posts rescaled onto the 65 x 65 plotting grid
left_post_x, left_post_y = (65, (36/80)*65)
right_post_x, right_post_y = (65, (44/80)*65)

# The distance and angle of a grid cell do not depend on the defender
# scenario, so compute them once for all 25 plots instead of once per plot.
# With indexing='ij', grid_x varies along axis 0 and grid_y along axis 1.
grid_x, grid_y = np.meshgrid(np.arange(65), np.arange(65), indexing='ij')

# Distance to centre of goal
distance_grid = np.sqrt((65 - grid_x)**2 + (65/2 - grid_y)**2)

# Angle to goal (radians): law of cosines on the triangle formed by the grid
# cell and the two posts
a = np.sqrt((grid_x - right_post_x)**2 + (grid_y - right_post_y)**2)
b = np.sqrt((left_post_x - right_post_x)**2 + (left_post_y - right_post_y)**2)
c = np.sqrt((left_post_x - grid_x)**2 + (left_post_y - grid_y)**2)
# Clamp so rounding can never push the cosine outside arccos's domain.
# arccos is never negative, so no sign correction is needed afterwards.
angle_grid = np.arccos(np.clip((a**2 + c**2 - b**2)/(2*a*c), -1.0, 1.0))

for dist_defender in range(1, 10, 2):
    for num_blocking_defenders in range(5):
        # calculate_xG only uses arithmetic and np.exp, so passing the grids
        # evaluates every cell at once; the scalar defender values broadcast.
        shot = {
            'distance': distance_grid,
            'angle': angle_grid,
            'distance_nearest_defender': dist_defender,
            'number_blocking_defenders': num_blocking_defenders,
        }
        # Cell (x, y) is drawn at [64-x, 64-y], i.e. both axes reversed
        xG_2d = calculate_xG(shot)[::-1, ::-1]

        (fig,ax) = FCPython.createGoalMouth()
        pos=ax.imshow(xG_2d, aspect = 'auto', origin="lower", extent=(0, 65, 0, 65), cmap=plt.cm.Reds)
        fig.colorbar(pos, ax=ax)
        ax.set_title('Expected Goals - # Blocking : '+str(num_blocking_defenders) + ' & Dist: ' + str(dist_defender))
        plt.xlim((-1,66))
        plt.ylim((-3,35))
        plt.tight_layout()
        plt.gca().set_aspect('equal', adjustable='box')
        plt.show()
There is a positive correlation between the xG from the logistic regression model and StatsBomb's own xG, which is reassuring. Our model is not nearly as sophisticated, but it appears to behave as intended, judging by the shot-location probability maps above for different numbers of blocking defenders and distances to the nearest defender.
# Compare this model's xG (horizontal axis) with StatsBomb's xG (vertical axis)
# on the held-out shots
model_xg = test['xG']
statsbomb_xg = test['statsbomb_xg']
plt.scatter(model_xg, statsbomb_xg)
plt.show()