#!/usr/bin/env python # coding: utf-8 # # Data Exploration and Reporting with PyHive # In this example we show how to explore data in Hive and build reports. # # The example uses a dataset of real estate sales in the Sacramento area, which you can download from [here](http://snurran.sics.se/hops/hive/Sacramentorealestatetransactions.csv). # Create a dataset with a name of your choosing, for example `RawData` and upload the CSV file. Make sure the dataset is empty, you will need to delete the auto-generated README.md # # First we need to setup the IPython magic and the connection information by running the following cell: # In[1]: get_ipython().run_line_magic('reload_ext', 'sql') from hops import hive hive.setup_hive_connection() # ## Load Data into an External table # Let's first create a Hive external and let's point the table to the just created dataset. In the query below you should replace `[Projectname]` with your project name. # # The `%%sql` magic allows you to write HiveQL and execute the query against the HiveServer. # In[ ]: get_ipython().run_cell_magic('sql', '', "CREATE EXTERNAL TABLE sacramento_properties_ext(\nstreet string,\ncity string,\nzip int,\nstate string,\nbeds int,\nbaths int,\nsq__ft float,\nsales_type string,\nsale_date string,\nprice float,\nlatitude float,\nlongitude float)\nROW FORMAT DELIMITED\nFIELDS TERMINATED BY ','\nLOCATION '/Projects/[Projectname]/RawData'\n") # To make sure the table was created successfully, we can list all the tables in the project's database and run a simple query against the external table. # In[22]: get_ipython().run_cell_magic('sql', '', 'show tables\n') # In[23]: get_ipython().run_cell_magic('sql', '', 'select * from sacramento_properties_ext limit 10\n') # ## Load data into a Managed Table # In this section we are going to load the data in a managed table. The table is going to be partitioned by ZIP and stored in ORC (a columnar storage format). # In[ ]: get_ipython().run_cell_magic('sql', '', 'CREATE TABLE sacramento_properties(\nstreet string,\ncity string,\nstate string,\nbeds int,\nbaths int,\nsq__ft float,\nsales_type string,\nsale_date string,\nprice float,\nlatitude float,\nlongitude float)\nPARTITIONED by (zip int)\nSTORED AS ORC\n') # In[17]: get_ipython().run_cell_magic('sql', '', 'show tables\n') # By default we don't allow dynamic inserts. This means that we should specify, for each row, which partition it belongs to. This is meant to avoid having a wrong insert query create a huge number of directories. # # In this case, however, the dataset is small and we know what we are doing. In the next cell we configure the hive session to allow nonstrict dymanic insert. # In[ ]: get_ipython().run_cell_magic('sql', '', 'set hive.exec.dynamic.partition=true;\nset hive.exec.dynamic.partition.mode=nonstrict;\n') # Now we load the data into the managed table. Partition column(s) should be listed last in the `SELECT` statement. # In[ ]: get_ipython().run_cell_magic('sql', '', 'INSERT OVERWRITE TABLE sacramento_properties PARTITION (zip)\nSELECT street, city, state, beds, baths, sq__ft, sales_type, sale_date, price, latitude, longitude, zip \nFROM sacramento_properties_ext\n') # In[24]: get_ipython().run_cell_magic('sql', '', 'select * \nFROM sacramento_properties \nWHERE zip=95608\nLIMIT 10\n') # In[26]: get_ipython().run_cell_magic('sql', '', 'select sales_type, avg(price) as avg_price\nFROM sacramento_properties \nWHERE zip=95608\nGROUP BY sales_type\nLIMIT 10\n') # ## Visualize the data # The query result can also be stored in a Python dictionary. This enables us to choose from a plethora of Python libraries available online, to visualize data and build BI reports # # In the example below we use the folium library to mark where the Condos in our dataset are located in Sacramento. For the next section to work, if you haven't already, please install the folium package in your project environment and restart the kernel. (Also, please re-run the first cell) # # (Internet access is required to visualize the map) # In[19]: condos = get_ipython().run_line_magic('sql', "select * from sacramento_properties where `sales_type` = 'Condo'") # In[20]: import folium m = folium.Map( location=[38.5815700, -121.4944000], zoom_start=10 ) for condo in condos: folium.Marker([float(condo[9]), float(condo[10])], popup=str(condo[8])).add_to(m) m # In[ ]: