#!/usr/bin/env python
# coding: utf-8

# # Data scraping, data wrangling, data analytics, exploratory data analysis, advanced plotting and clustering with love.

# In this series of tutorials, we are going to address how to do data
# scraping, data wrangling, data analytics and advanced plotting. This is a
# sports analytics case.

# In this particular tutorial we address how to extract data from the web
# (data scraping). We also address some basic `DataFrame` manipulation.

# In[2]:


# IPython magic: this file is a notebook export and expects to run under
# IPython/Jupyter, where `get_ipython` is defined.
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:


import requests
import pandas as pd


# In order to do data analytics, we need data. So the first thing to do is
# get the data from somewhere.
#
# Hereafter we are going to work with NBA players data. In the NBA site
# stats.nba.com you will find all the statistics for every single player.
#
# In the following blog post [1]
#
# http://www.gregreda.com/2015/02/15/web-scraping-finding-the-api/
#
# you will find a pretty good explanation of how to get the data from
# stats.nba.com [2] (or any other site for this matter).
#
# Remember, the most important part when scraping data from a web site is
# knowing how to access the API used to collect the data.

# To get Lebron James shot chart data we will use this url:

# In[4]:


# Lebron James data
# PlayerID is the number in the nba.stats site
# Adjacent string literals inside the parentheses are concatenated by Python,
# so the pieces below form one single URL.
shot_chart_url = (
    'http://stats.nba.com/stats/shotchartdetail?CFID=33&CFPAR'
    'AMS=2014-15&ContextFilter=&ContextMeasure=FGA&DateFrom=&D'
    'ateTo=&GameID=&GameSegment=&LastNGames=0&LeagueID=00&Loca'
    'tion=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&'
    'PaceAdjust=N&PerMode=PerGame&Period=0&PlayerID=2544&Plu'
    'sMinus=N&Position=&Rank=N&RookieYear=&Season=2014-15&Seas'
    'onSegment=&SeasonType=Regular+Season&TeamID=0&VsConferenc'
    'e=&VsDivision=&mode=Advanced&showDetails=0&showShots=1&sh'
    'owZones=0'
)


# Keep in mind that the previous url corresponds to Lebron James shot chart
# data. If you want to access different data, another player or the data for
# a specific team, you will need to find the right link.

# By now, we already know that the file is in json format. If you have
# **JSONView** [3] installed in your web browser, you can view the content of
# the file. Just copy and paste the link in your web browser
#
# http://stats.nba.com/stats/shotchartdetail?CFID=33&CFPARAMS=2014-15&ContextFilter=&ContextMeasure=FGA&DateFrom=&DateTo=&GameID=&GameSegment=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerID=2544&PlusMinus=N&Position=&Rank=N&RookieYear=&Season=2014-15&SeasonSegment=&SeasonType=Regular+Season&TeamID=0&VsConference=&VsDivision=&mode=Advanced&showDetails=0&showShots=1&showZones=0

# Now let's use the module **requests** [4] to get the server data from
# `shot_chart_url`.
#
# **requests** is an Apache2 Licensed HTTP library, you can find more
# information about requests in the following link
# http://docs.python-requests.org/en/latest/#

# In[5]:


# Get the webpage containing the data.
# The timeout stops the call from blocking forever when the server never
# answers, and raise_for_status() reports an HTTP error right here instead of
# letting it show up later as a confusing JSON decoding failure.
# NOTE(review): if the request is rejected or times out, the server may expect
# browser-like headers (e.g. a User-Agent) -- confirm and pass `headers=`.
response = requests.get(shot_chart_url, timeout=30)
response.raise_for_status()


# We can view the server data header as follows:

# In[6]:


response.headers


# Now we can access the header data using any string we want, no need to say
# that the string must exist in the server headers:

# In[7]:


response.headers['content-type']


# As we can see, the data is in json format.
#
# By the way, when we were doing web scraping we already knew that the data
# was in json format. I just showed you how to get that info from the header.

# In the module **requests**, there is also a builtin JSON decoder. To print
# the content of `response`, we can proceed as follows

# In[8]:


# Uncomment this line to print the content of response
#response.json()

# To time the function
#%time response.json()


# Now we are ready to get the data we want in order to construct the
# **pandas** [5] `DataFrame`.
# In[9]:


# Decode the JSON body once and keep the first result set. Both the column
# names and the rows are read from it, so the payload does not have to be
# parsed again for every lookup.
shot_chart_detail = response.json()['resultSets'][0]

# Grab the headers to be used as column headers for our DataFrame
headers = shot_chart_detail['headers']


# The data is in standard json format, and is made of three main blocks.
#
# We are interested in getting the data from the block **`resultSets`**,
# therefore the notation **`response.json()['resultSets']`**
#
# The **`resultSets`** block has two sub-blocks, we want to access the first
# one or the one with the name **`Shot_Chart_Detail`**, therefore the notation
# **`response.json()['resultSets'][0]`** (kept above as `shot_chart_detail`).
#
# Now we want to access the information contained in **`headers`**, hence the
# notation **`shot_chart_detail['headers']`**
#
# The object **`shot_chart_detail['headers']`** contains the headers' names of
# each column.

# To grab the shot chart data or **`Shot Chart Detail`** in the json data, we
# proceed in a similar way. Keep in mind that the data of interest is located
# in the block **`rowSet`**. Therefore the notation
# **`shot_chart_detail['rowSet']`**

# In[10]:


# Grab the shot chart data
shots = shot_chart_detail['rowSet']


# To know the type of the variables `headers` and `shots`, we can proceed as
# follows:

# In[11]:


type(headers)


# In[12]:


type(shots)


# Now we can create a **pandas** `DataFrame` using the scraped shot chart
# data.
#
# Remember, the data is saved in the objects `shots` and `headers` created in
# the previous step.
#
# To create the `DataFrame`, we can proceed as follows:

# In[13]:


shot_df = pd.DataFrame(data=shots, columns=headers)


# At this point, we have a pandas' `DataFrame` ready to use.

# The `DataFrame` `shot_df` contains the shot chart data of all the field
# goal attempts Lebron James took during the 2014-15 regular season.
#
# We are specifically interested in the data saved in the columns `LOC_X`,
# `LOC_Y` and `SHOT_MADE_FLAG`.
#
# `LOC_X` and `LOC_Y` are the coordinate values for each shot attempted and
# `SHOT_MADE_FLAG` contains the outcome of the shot (missed it or made it).
#
# To display the data saved in `shot_df`, you can proceed as follows

# In[14]:


#shot_df.head()
shot_df.head(n=4)


# `.head(4)` limits the output to the first 4 rows. Evaluating `shot_df` on
# its own would ask pandas to display the whole `DataFrame` instead.

# To print a concise summary of the `DataFrame`,

# In[57]:


shot_df.info()


# To print the names of the columns in the `DataFrame`,

# In[58]:


shot_df.columns


# And in a similar way we can print the row labels (the index) of the
# `DataFrame`,

# In[59]:


shot_df.index


# We can also display all the columns of the first two rows (positions 0 and
# 1 -- the end of an `iloc` slice is exclusive), as follows,

# In[60]:


shot_df.iloc[:2]


# or we can display the first two columns of the first four rows, as follows,

# In[61]:


shot_df.iloc[:4, :2]


# If you do not define the labels of the columns in the **pandas**
# `DataFrame`, the columns get default integer labels (0, 1, 2, ...).

# In[16]:


shot_df1 = pd.DataFrame(shots)
shot_df1.head(n=2)


# If you have many columns, **pandas** will not show all of them. To force
# **pandas** to display all the columns, you can proceed as follows,

# In[17]:


# Passing None removes the limit, so any number of columns is displayed.
#pd.set_option('display.max_columns', 6)
pd.set_option('display.max_columns', None)

shot_df.head(n=2)


# Alternatively, you can use the API for display objects in IPython.
# In[18]:


# View the head of the DataFrame and all its columns
from IPython.display import display

# option_context applies the setting only inside the `with` block.
with pd.option_context('display.max_columns', None):
    display(shot_df.head(2))


# If you want to save the **pandas** `DataFrame` in a csv file, you can
# proceed as follows

# In[19]:


shot_df.to_csv(path_or_buf='test.csv', mode='w')


# If you want to time the execution of a Python statement or expression, you
# can use the `%time` magic,

# In[20]:


get_ipython().run_line_magic('time', "shot_df.to_csv(path_or_buf='test.csv',mode='w')")


# If you want to add reusability to your code, you can create functions. Let
# us create a function to save the scraped data in csv format.

# In[21]:


def savecsv(name_of_file, df=None):
    """Write a DataFrame to ``name_of_file`` in csv format.

    Parameters
    ----------
    name_of_file : str
        Path of the csv file to write; the file is opened in mode ``'w'``.
    df : pandas.DataFrame, optional
        The DataFrame to save. When omitted, the module-level ``shot_df`` is
        used, so existing calls ``savecsv(name)`` keep working unchanged.
    """
    if df is None:
        # Fall back to the notebook-wide DataFrame built from the scraped data.
        df = shot_df
    df.to_csv(path_or_buf=name_of_file, mode='w')


# We can now access the function,

# In[22]:


name_of_file = 'test1.csv'
savecsv(name_of_file)


# If you want to save the server data from the url `shot_chart_url` in a json
# file, we need to use the module **json** [6].

# In[23]:


import json

# The `with` block closes the file even if json.dump raises.
with open('data_json.json', 'w') as outfile:
    json.dump(response.json(), outfile)

#json.dump(response.json(), open('data1_json.json', 'w'))


# In[24]:


# This method is not working:
#   - str(response.json()) is the Python repr of the decoded object, not JSON;
#   - the file is opened in binary mode ('wb') but a str is written to it;
#   - `obj.close` has no parentheses, so the file is never actually closed.
#obj = open('data1_json.json', 'wb')
#obj.write(str(response.json()))
#obj.close


# By the way, do not erase the csv and json files, as we are going to use
# them later.

# Finally, in the blog post [7], you will find a very nice tutorial of how to
# create NBA shot charts. Hereafter, we are going to address and elaborate on
# most of the things explained in the mentioned blog post.

# ### In the next tutorial, we are going to do some data wrangling, data analytics and exploratory data analysis, using the `DataFrame` that we just created by scraping data from a web site.
# In[ ]:


# # References
#
# [1] http://www.gregreda.com/2015/02/15/web-scraping-finding-the-api/
#
# [2] http://stats.nba.com
#
# [3] http://jsonview.com
#
# [4] http://docs.python-requests.org/en/latest/#
#
# [5] http://pandas.pydata.org/
#
# [6] https://docs.python.org/2/library/json.html
#
# [7] http://savvastjortjoglou.com/nba-shot-sharts.html?utm_source=Python+Weekly+Newsletter&utm_campaign=5185ff0538-Python_Weekly_Issue_202_July_30_2015&utm_medium=email&utm_term=0_9e26887fc5-5185ff0538-312727397

# In[ ]:


# In[ ]:


# In[ ]:


# In[25]:


#import sys
#print('Python version:', sys.version_info)

#import IPython
#print('IPython version:', IPython.__version__)

#print('Requests version', requests.__version__)
#print('Pandas version:', pd.__version__)
#print('json version:', json.__version__)


# In[ ]: