#!/usr/bin/env python
# coding: utf-8

# # Ezega Data Analysis
#
# Author: [Meheret Samuel](https://github.com/senadev42)
#
# ### The Pre-amble
#
# A few months ago a friend of mine, [Kidus](https://github.com/kidesleo), decided to scrape ezega.com, which is a sort of index for businesses in Ethiopia. The result was a JSONL file with about 8k entries that I've been sitting on, with the vague goal of looking into it but never finding the time to.
#
# I finally have time now, and while doing data analysis with clean datasets taken off the internet has been fun, it's time to work with something dirty and real.

# ## Setting Up
#
# First, let's import the libraries we're going to be using.

# In[1]:


import pandas as pd
import matplotlib.pyplot as plt
import re

# This dataset has long columns
pd.set_option('display.max_colwidth', None)


# And then load the file itself, passing lines=True because this is JSONL: each line is a separate JSON record.

# In[2]:


df = pd.read_json('ezega_data.jsonl', lines=True)


# Now let's take a peek at our dataset.

# In[3]:


df.head()


# As expected, it's a bit messy.
#
# At a glance,
# - `business_url` and `business_description` look like they have a lot of values missing,
# - the business titles are inconsistently capitalized,
# - some of the locations are cities while others are full addresses,
# - the business numbers are arrays that seem to mix the +251 country-code prefix with the simple, local 0 prefix,
# - and the categories don't seem super relevant to the company names, so I suspect they weren't registered properly.
#
# There's probably other stuff that isn't immediately apparent from this quick look through, but let's work on cleaning this up first.

# ## Clean Up
#
# ### 1. Missing Values
#
# > business_url and business_description look like they have a lot of values missing
#
# Let's start by seeing exactly how much is missing, and whether any other fields are missing data as well.

# In[4]:


print("Number of rows: ", df.shape[0])
df.isnull().sum()


# Okay, that's actually a lot of missing URLs and descriptions. Dropping the rows in question isn't an option when they make up over half of our dataset, so we can just replace them with placeholders for now.

# In[5]:


df['business_url'] = df['business_url'].fillna('N/A')
df['business_description'] = df['business_description'].fillna('No description available')


# Let's check again.

# In[6]:


df.isnull().sum()


# In[7]:


df.head()


# Well, that fixes that.

# ### 2. Cleaning the Title
#
# > the business titles are inconsistently capitalized
#
# This would be somewhat of a non-issue if I didn't plan on doing some analysis on the names themselves later. And frankly, it feels like a tricky issue to solve. Let's look at the business_title column and see what we're working with first.

# In[8]:


def is_fully_capitalized(title):
    '''
    For every business title:
    1. Split it into words using space as a separator
    2. Count the number of words in the title
    3. Check if each word is fully uppercase, and count how many are
    4. If that count equals the total number of words, the title is fully capitalized, so return True
    '''
    words = title.split()
    total_words = len(words)
    capitalized_words = sum(word.isupper() for word in words)
    return capitalized_words == total_words


fully_capitalized_titles = df[df['business_title'].apply(is_fully_capitalized)]

print("Number of titles where all the words are capitalized")
print(fully_capitalized_titles['business_title'].shape[0])


# Okay, that's nearly half the titles. Also not very meaningful.
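# As a quick, optional sanity check, here's a small random sample of the flagged titles. (The random_state value is arbitrary and only there to make the sample reproducible.)

# In[ ]:


fully_capitalized_titles['business_title'].sample(5, random_state=42)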
# Putting it in a binary yes-or-no way doesn't seem to be the best way of looking at this, so let's look at a distribution instead.

# In[9]:


def count_fully_capitalized_words(title):
    words = title.split()
    capitalized_words = sum(word.isupper() for word in words)
    return capitalized_words


df['fully_capitalized_words_count'] = df['business_title'].apply(count_fully_capitalized_words)

plt.hist(df['fully_capitalized_words_count'], bins=range(df['fully_capitalized_words_count'].max() + 2), edgecolor='black')
plt.xlabel('Number of Fully Capitalized Words')
plt.ylabel('Count')
plt.title('Distribution of Fully Capitalized Words in Titles')
plt.show()


# So about half the business titles have no fully uppercase words, and for the ones that do, the count seems to peak at 3 and can extend as far as ... 14. Actually, I want to see the full distribution of title lengths in words now.

# In[10]:


def count_words(title):
    words = title.split()
    return len(words)


df['total_words_count'] = df['business_title'].apply(count_words)

plt.hist(df['total_words_count'], bins=range(df['total_words_count'].max() + 2), edgecolor='black')
plt.xlabel('Number of Words in Business Title')
plt.ylabel('Count')
plt.title('Distribution of Words in Titles')
plt.show()


# So we've got titles as short as 0 words and as long as 17.5(?) words. Let's take a look at the first group.

# In[11]:


print("Number of titles with 0 words: ", df[df['total_words_count'] == 0].shape[0])
df[df['total_words_count'] == 0].head()


# We'll have to drop these, I suppose, since there's not much we can do with them. The locations are malformed and the URLs go nowhere as well.

# In[12]:


df = df[df['total_words_count'] != 0]
df[df['total_words_count'] == 0].head()


# Back to the distribution. Let's now look at the second group and figure out why we've got such long titles.

# In[13]:


upper_bound = 14

plt.hist(df[df['total_words_count'] >= upper_bound]['total_words_count'],
         bins=range(upper_bound, df[df['total_words_count'] >= upper_bound]['total_words_count'].max() + 2),
         edgecolor='black')
plt.xlabel('Number of Words in Business Title')
plt.ylabel('Count')
plt.title('Distribution of Words in Titles')
plt.show()

print(df[df['total_words_count'] >= upper_bound].shape)
df[df['total_words_count'] >= upper_bound].head()


# In[14]:


upper_bound = 15
print(df[df['total_words_count'] >= upper_bound].shape)
df[df['total_words_count'] >= upper_bound].head()


# In[15]:


upper_bound = 16
print(df[df['total_words_count'] >= upper_bound].shape)
df[df['total_words_count'] >= upper_bound].head()


# Browsing through the titles at different lengths shows that the title inflation is essentially a result of poor data entry and/or registration practices. Some of them have legitimate reasons to be that long, while others are a result of business owners entering contact information or taglines into the title field.
#
# Regardless, we'll have to find a way to work around it.

# In[ ]:
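# One possible workaround, sketched below but not applied to the dataset here, would be to normalize the casing itself so that fully capitalized titles don't skew any later name-based analysis. The helper name normalize_title, the acronym_max_len threshold, and the normalized_title column are illustrative choices rather than anything the data dictates.

# In[ ]:


def normalize_title(title, acronym_max_len=3):
    # Re-capitalize words that are written entirely in uppercase
    # (e.g. "INTERNATIONAL" -> "International"), but leave short all-caps
    # words alone since they're likely acronyms such as "PLC".
    words = []
    for word in title.split():
        if word.isupper() and len(word) > acronym_max_len:
            words.append(word.capitalize())
        else:
            words.append(word)
    return ' '.join(words)


# Written to a new column so the original titles stay available for comparison.
df['normalized_title'] = df['business_title'].apply(normalize_title)
df[['business_title', 'normalized_title']].head()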