# We begin by importing the survey file
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning on load.
Survey_2017 = pd.read_csv('2017-fCC-New-Coders-Survey-Data.csv', low_memory=False)
/dataquest/system/env/python3/lib/python3.4/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (17,62) have mixed types. Specify dtype option on import or set low_memory=False.
# Summarize the DataFrame: index range, number of columns, dtypes and memory usage
Survey_2017.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 18175 entries, 0 to 18174 Columns: 136 entries, Age to YouTubeTheNewBoston dtypes: float64(105), object(31) memory usage: 18.9+ MB
# Size of the dataset as a (rows, columns) tuple
Survey_2017.shape
(18175, 136)
# List the column names
Survey_2017.columns
Index(['Age', 'AttendedBootcamp', 'BootcampFinish', 'BootcampLoanYesNo', 'BootcampName', 'BootcampRecommend', 'ChildrenNumber', 'CityPopulation', 'CodeEventConferences', 'CodeEventDjangoGirls', ... 'YouTubeFCC', 'YouTubeFunFunFunction', 'YouTubeGoogleDev', 'YouTubeLearnCode', 'YouTubeLevelUpTuts', 'YouTubeMIT', 'YouTubeMozillaHacks', 'YouTubeOther', 'YouTubeSimplilearn', 'YouTubeTheNewBoston'], dtype='object', length=136)
# Preview the first 15 rows; the raw data is not clean and contains many NaN values.
# NOTE(review): the value returned by get_option() is neither stored nor shown
# here - if the goal was to display every column, pd.set_option() would be
# needed instead; confirm the intent.
pd.get_option("display.max_columns")
Survey_2017.head(15)
Age | AttendedBootcamp | BootcampFinish | BootcampLoanYesNo | BootcampName | BootcampRecommend | ChildrenNumber | CityPopulation | CodeEventConferences | CodeEventDjangoGirls | ... | YouTubeFCC | YouTubeFunFunFunction | YouTubeGoogleDev | YouTubeLearnCode | YouTubeLevelUpTuts | YouTubeMIT | YouTubeMozillaHacks | YouTubeOther | YouTubeSimplilearn | YouTubeTheNewBoston | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 27.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | more than 1 million | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 34.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | less than 100,000 | NaN | NaN | ... | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 21.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | more than 1 million | NaN | NaN | ... | NaN | NaN | NaN | 1.0 | 1.0 | NaN | NaN | NaN | NaN | NaN |
3 | 26.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | between 100,000 and 1 million | NaN | NaN | ... | 1.0 | 1.0 | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN |
4 | 20.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | between 100,000 and 1 million | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 | 28.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | less than 100,000 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | CodingEntrepreneurs | NaN | 1.0 |
6 | 29.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | between 100,000 and 1 million | 1.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
7 | 29.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | between 100,000 and 1 million | NaN | NaN | ... | 1.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 |
8 | 23.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | less than 100,000 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
9 | 24.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | more than 1 million | 1.0 | NaN | ... | 1.0 | 1.0 | NaN | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | 1.0 |
10 | 20.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | less than 100,000 | NaN | NaN | ... | 1.0 | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN |
11 | 22.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | more than 1 million | NaN | NaN | ... | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
12 | 18.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | less than 100,000 | NaN | NaN | ... | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
13 | 44.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | between 100,000 and 1 million | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
14 | 27.0 | 1.0 | 1.0 | 0.0 | Code Fellows | 1.0 | NaN | between 100,000 and 1 million | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
15 rows × 136 columns
# Percentage of survey rows giving each distinct 'JobRoleInterest' answer.
# The denominator is the total number of rows, so respondents who left the
# question blank are counted in it even though value_counts() skips them.
JobInterestPct = (
    Survey_2017['JobRoleInterest']
    .value_counts()
    .div(len(Survey_2017))
    .mul(100)
)
JobInterestPct
Full-Stack Web Developer 4.528198 Front-End Web Developer 2.475928 Data Scientist 0.836314 Back-End Web Developer 0.781293 Mobile Developer 0.643741 Game Developer 0.627235 Information Security 0.506190 Full-Stack Web Developer, Front-End Web Developer 0.352132 Front-End Web Developer, Full-Stack Web Developer 0.308116 Product Manager 0.302613 Data Engineer 0.291609 User Experience Designer 0.286107 User Experience Designer, Front-End Web Developer 0.236589 Front-End Web Developer, Back-End Web Developer, Full-Stack Web Developer 0.214580 Back-End Web Developer, Full-Stack Web Developer, Front-End Web Developer 0.198074 Back-End Web Developer, Front-End Web Developer, Full-Stack Web Developer 0.198074 DevOps / SysAdmin 0.198074 Full-Stack Web Developer, Front-End Web Developer, Back-End Web Developer 0.170564 Front-End Web Developer, Full-Stack Web Developer, Back-End Web Developer 0.165062 Full-Stack Web Developer, Mobile Developer 0.159560 Front-End Web Developer, User Experience Designer 0.159560 Back-End Web Developer, Full-Stack Web Developer 0.148556 Full-Stack Web Developer, Back-End Web Developer 0.143054 Back-End Web Developer, Front-End Web Developer 0.110041 Full-Stack Web Developer, Back-End Web Developer, Front-End Web Developer 0.104539 Data Engineer, Data Scientist 0.104539 Front-End Web Developer, Mobile Developer 0.099037 Full-Stack Web Developer, Data Scientist 0.093535 Data Scientist, Data Engineer 0.088033 Mobile Developer, Game Developer 0.088033 ... 
Full-Stack Web Developer, Game Developer, Mobile Developer, Information Security 0.005502 Full-Stack Web Developer, Back-End Web Developer, DevOps / SysAdmin, Data Engineer 0.005502 DevOps / SysAdmin, Data Engineer, Quality Assurance Engineer, Data Scientist 0.005502 User Experience Designer, Full-Stack Web Developer, Back-End Web Developer, Product Manager, Information Security, Quality Assurance Engineer 0.005502 Non technical 0.005502 DevOps / SysAdmin, Information Security, Mobile Developer, Quality Assurance Engineer 0.005502 Game Developer, Front-End Web Developer, Back-End Web Developer, User Experience Designer, Mobile Developer, Full-Stack Web Developer 0.005502 Not Sure Yet 0.005502 Data Engineer, Data Scientist, Front-End Web Developer, Full-Stack Web Developer, Back-End Web Developer, Mobile Developer 0.005502 Full-Stack Web Developer, Front-End Web Developer, Quality Assurance Engineer, Product Manager, Back-End Web Developer 0.005502 Game Developer, DevOps / SysAdmin, Mobile Developer, Back-End Web Developer, Information Security, Data Engineer 0.005502 User Experience Designer, Data Engineer, Front-End Web Developer, Full-Stack Web Developer, Data Scientist, Back-End Web Developer 0.005502 Full-Stack Web Developer, Front-End Web Developer, Data Scientist, Back-End Web Developer, User Experience Designer, Mobile Developer, Data Engineer 0.005502 Information Security, Back-End Web Developer, Game Developer, Product Manager 0.005502 Game Developer, Mobile Developer, Data Scientist, Full-Stack Web Developer, Back-End Web Developer, Front-End Web Developer 0.005502 Game Developer, User Experience Designer, Back-End Web Developer, Front-End Web Developer 0.005502 Data Engineer, Data Scientist, Front-End Web Developer, DevOps / SysAdmin 0.005502 Mobile Developer, Game Developer, Front-End Web Developer, DevOps / SysAdmin 0.005502 Front-End Web Developer, Back-End Web Developer, Mobile Developer, Full-Stack Web Developer, Data Engineer 0.005502 Mobile 
Developer, Front-End Web Developer, Back-End Web Developer, Information Security 0.005502 Front-End Web Developer, Mobile Developer, Back-End Web Developer, Full-Stack Web Developer, Game Developer 0.005502 Product Manager, Back-End Web Developer, Data Scientist, Full-Stack Web Developer, Game Developer, User Experience Designer, Information Security 0.005502 User Experience Designer, Product Manager, Front-End Web Developer, Game Developer 0.005502 Game Developer, Information Security, Back-End Web Developer, Full-Stack Web Developer, Mobile Developer, Front-End Web Developer 0.005502 Game Developer, Information Security, Full-Stack Web Developer, User Experience Designer, Mobile Developer, Data Scientist, Back-End Web Developer, Front-End Web Developer, Data Engineer 0.005502 Game Developer, Information Security, Full-Stack Web Developer, Back-End Web Developer 0.005502 Data Engineer, milatary engineer 0.005502 Mobile Developer, Front-End Web Developer, Data Scientist, Full-Stack Web Developer, Data Engineer, Back-End Web Developer 0.005502 Data Engineer, Information Security, Game Developer, Full-Stack Web Developer, Mobile Developer, Data Scientist, User Experience Designer 0.005502 Data Scientist, DevOps / SysAdmin, Game Developer, Data Engineer, Robotics 0.005502 Name: JobRoleInterest, Length: 3213, dtype: float64
These data show that "Full-Stack Web Developer" and "Front-End Web Developer" are the two most popular subjects.
However, because many e-learners are interested in more than
one subject, we cannot reach any conclusion yet.
We should look further into the data to find out the most popular subjects, especially if many learners are interested in more than
one subject.
We will start by cleaning the data and looking at the multiple-subject selections made by e-learners.
# Keep only the respondents who answered the job-role question. The explicit
# .copy() makes the result an independent DataFrame, so the column assignments
# that follow write to it directly instead of to a slice of Survey_2017
# (which is what triggers SettingWithCopyWarning).
Survey_2017_c = Survey_2017[Survey_2017['JobRoleInterest'].notna()].copy()
# Cast 'JobRoleInterest' to string so it can be analyzed with string operations.
Survey_2017_c['JobRoleInterest'] = Survey_2017_c['JobRoleInterest'].astype(str)
/dataquest/system/env/python3/lib/python3.4/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Add the column 'NumberOfIntrest': each answer is a comma-separated string,
# so the number of job roles is the number of comma-delimited pieces.
Survey_2017_c['NumberOfIntrest'] = Survey_2017_c['JobRoleInterest'].str.split(",").str.len()
# Show the first 25 answers next to their role counts
Survey_2017_c[['JobRoleInterest', 'NumberOfIntrest']].head(25)
/dataquest/system/env/python3/lib/python3.4/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
JobRoleInterest | NumberOfIntrest | |
---|---|---|
1 | Full-Stack Web Developer | 1 |
2 | Front-End Web Developer, Back-End Web Develo... | 5 |
3 | Front-End Web Developer, Full-Stack Web Deve... | 3 |
4 | Full-Stack Web Developer, Information Security... | 5 |
6 | Full-Stack Web Developer | 1 |
9 | Full-Stack Web Developer, Quality Assurance ... | 6 |
11 | DevOps / SysAdmin, Data Scientist, Informa... | 4 |
13 | Back-End Web Developer, Full-Stack Web Develop... | 3 |
14 | Full-Stack Web Developer | 1 |
15 | Full-Stack Web Developer | 1 |
16 | Full-Stack Web Developer | 1 |
18 | Full-Stack Web Developer, Front-End Web Deve... | 4 |
19 | Front-End Web Developer, Mobile Developer,... | 4 |
21 | Information Security | 1 |
22 | Full-Stack Web Developer | 1 |
23 | Back-End Web Developer | 1 |
28 | Full-Stack Web Developer | 1 |
29 | Front-End Web Developer, Data Scientist, F... | 4 |
30 | Back-End Web Developer, Full-Stack Web Developer | 2 |
31 | Front-End Web Developer | 1 |
32 | Data Scientist, Information Security, Data E... | 3 |
33 | Full-Stack Web Developer, Quality Assurance ... | 2 |
34 | Back-End Web Developer, Full-Stack Web Developer | 2 |
35 | Back-End Web Developer, Full-Stack Web Develop... | 4 |
37 | Mobile Developer, Product Manager | 2 |
# Distinct values taken by the number of subjects of interest per learner
Survey_2017_c['NumberOfIntrest'].unique()
array([ 1, 5, 3, 6, 4, 2, 12, 7, 8, 10, 9, 11, 13])
# Frequency table: how many learners selected each number of subjects of interest
Survey_2017_c['NumberOfIntrest'].value_counts()
1 2213 3 1111 4 1064 5 842 2 761 6 470 7 270 8 123 9 69 10 33 12 21 11 13 13 2 Name: NumberOfIntrest, dtype: int64
# Plot how many learners selected each number of subjects of interest.
interest_counts = Survey_2017_c['NumberOfIntrest']
max_interests = int(interest_counts.max())
# Bin edges sit on the half-integers (0.5, 1.5, ...) so every bar covers exactly
# one integer value and is centred on its tick label; the range follows the data
# rather than a hard-coded maximum.
plt.hist(interest_counts, bins=np.arange(0.5, max_interests + 1.5, 1))
plt.xticks(np.arange(0, max_interests + 1, 1))
plt.title('Number of interest per e-learners')
plt.xlabel('number of subjects of interest')
plt.ylabel('e-learners')
plt.show()
# Use describe() for summary statistics, then calculate the mode and the median of the data.
Survey_2017_c['NumberOfIntrest'].describe()
count 6992.000000 mean 3.232265 std 2.143700 min 1.000000 25% 1.000000 50% 3.000000 75% 5.000000 max 13.000000 Name: NumberOfIntrest, dtype: float64
# Most frequent number of subjects of interest (mode)
Survey_2017_c['NumberOfIntrest'].mode()
0 1 dtype: int64
# Median number of subjects of interest
Survey_2017_c['NumberOfIntrest'].median()
3.0
# Add the column 'JobInterestList': each respondent's roles as a list, obtained
# by splitting the comma-separated answer. Pieces are kept exactly as split, so
# any whitespace padding around a role is still present at this stage.
Survey_2017_c['JobInterestList'] = Survey_2017_c['JobRoleInterest'].str.split(",")
/dataquest/system/env/python3/lib/python3.4/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Preview the first 15 lists of job roles
Survey_2017_c['JobInterestList'].head(15)
1 [Full-Stack Web Developer] 2 [ Front-End Web Developer, Back-End Web Deve... 3 [ Front-End Web Developer, Full-Stack Web De... 4 [Full-Stack Web Developer, Information Securi... 6 [Full-Stack Web Developer] 9 [Full-Stack Web Developer, Quality Assuranc... 11 [ DevOps / SysAdmin, Data Scientist, Info... 13 [Back-End Web Developer, Full-Stack Web Devel... 14 [Full-Stack Web Developer] 15 [Full-Stack Web Developer] 16 [Full-Stack Web Developer] 18 [Full-Stack Web Developer, Front-End Web De... 19 [ Front-End Web Developer, Mobile Develope... 21 [Information Security] 22 [Full-Stack Web Developer] Name: JobInterestList, dtype: object
# Build a dictionary with the frequency of each individual job role across all
# respondents, in order to find the most sought-after jobs.
from collections import Counter

# Each role is stripped first because splitting on "," leaves whitespace padding
# around the labels; without it the same role would be counted under several keys.
# The Counter is converted back to a plain dict so the result keeps its type.
InterestingJobs = dict(
    Counter(
        job.strip()
        for jobL in Survey_2017_c['JobInterestList']
        for job in jobL
    )
)
InterestingJobs
{'AI': 1, 'AI Developer': 1, 'AI Engineer': 1, 'AI and Machine Learning': 1, 'AI and neuroscience': 1, 'All - whatever is required to develop tools to revolutionize the mechanical engineering process': 1, 'Analyst': 1, 'Any of them.': 1, 'Anything that engages me': 1, 'Application Support Analyst': 1, 'Artificial Intelligence': 4, 'Artificial Intelligence Engineer': 1, 'Artificial Intelligence engineer': 1, 'Artificial intelligence': 1, 'Astrophysicist': 1, 'BA or developer': 1, 'Back-End Web Developer': 2772, 'Bioinformatics': 2, 'Bioinformatics/science': 1, 'Bioinformatitian': 1, 'Bitcoin/Crypto': 1, 'Business Analyst': 2, 'Campaign Manager': 1, 'Ceo': 1, 'Cloud computing': 1, 'College professor': 1, 'Compilers': 1, 'Computer Architect': 1, 'Criminal Defense Attorney-- focusing on cyber crimes': 1, 'Cybersecurity': 1, 'Data Analyst': 2, 'Data Engineer': 1248, 'Data Reporter': 1, 'Data Scientist': 1643, 'Data Visualization Specialist': 1, 'Data analyst': 1, 'Data visualisation': 1, 'Data/Interactive Journalist': 1, 'Databases': 1, 'Desings': 1, 'Desktop Application Developer': 3, 'Desktop Applications': 1, 'Desktop Applications Programmer': 1, 'Desktop applications developer': 1, 'DevOps / SysAdmin': 927, 'Developer Evangelist': 1, 'Digital Humanitites': 1, 'Document Controller': 1, "Don't know yet": 1, 'Education': 1, 'Embedded Developer': 1, 'Embedded hardware': 1, 'Entrepreneur': 1, 'Entreprenuer / Web Dev Hustler': 1, 'Ethical Hacker': 2, 'Financial Services': 1, 'Founder': 1, 'Front End Web Development': 1, 'Front end': 1, 'Front-End Web Designer': 1, 'Front-End Web Developer': 3533, 'Full Stack Developer': 1, 'Full Stack Software Engineer': 1, 'Full-Stack Web Developer': 4198, 'GIS Database Admin': 1, 'GIS Developer': 1, 'Game Developer': 1628, 'Growth Hacker': 1, 'I am interested in Game Development': 1, "I don't know yet!": 1, 'I dont yet know': 1, "I'm just learning code to increase my skill-set. 
I see it as a literacy issue.": 1, 'IT specialist': 1, 'Informatician': 1, 'Information Architect': 1, 'Information Developer': 1, 'Information Security': 1326, 'Information Technology': 1, 'Infrastructure Architect': 1, 'IoT': 1, 'IoT Developer': 1, 'Java developer': 2, 'Journalist': 1, 'Journalist/Graphic Designer/Marketing': 1, 'Library Developer': 1, 'Machine Learning': 2, 'Machine Learning Engineer': 3, 'Machine learning and AI': 1, 'Machine learning engineer': 1, 'Marketing': 1, 'Marketing Automation': 1, 'Mobile Developer': 2304, 'Mobile Development': 1, 'Natural Language Processing': 1, 'Network': 1, 'Network Engineer': 1, 'Networking': 1, 'Non technical': 1, 'Not Sure': 1, 'Not Sure Yet': 1, 'Not sure': 2, 'Not sure yet': 2, 'Not sure!': 1, 'Operating Systems': 1, 'Pharmaceutical industry': 1, 'Pharmacy tech': 1, 'Physicist': 1, 'Product Designer': 2, 'Product Manager': 814, 'Program Manager': 1, 'Programmer': 2, 'Programming': 1, 'Project Management': 1, 'Project Manager': 2, 'Project manager': 1, 'Python Developer': 1, 'Quality Assurance Engineer': 498, 'Quant (Algorithmic Trader)': 1, 'Real-time systems': 1, 'Remote Support': 1, 'Research': 2, 'Research and education': 1, 'Researcher': 2, 'Robotics': 1, 'Robotics Process Automation Specialist': 1, 'Robotics and AI Engineer': 1, 'SEO': 1, 'SWE': 1, 'Scientific Programming': 1, 'Security Business Analyst': 1, 'Software Developer': 6, 'Software Developer or Front-End Web Developer': 1, 'Software Development': 1, 'Software Developper': 1, 'Software Engineer': 11, 'Software Engineer (Computer Science Based)': 1, 'Software Engineering': 1, 'Software Engineers': 1, 'Software Projects Manager': 1, 'Software Specialist': 1, 'Software developer': 1, 'Software engineer': 4, 'Software enginner': 1, 'Support Engineer or API Support': 1, 'System Administrator/Network': 1, 'System Engineer': 1, 'System Software': 1, 'Systems Engineer': 1, 'Systems Programmer': 1, 'Systems Programming': 1, 'Teacher': 1, 'Teacher. 
Teaching students to code.': 1, 'Tech art': 1, 'Tech lobbiest': 1, 'Technical Writer': 1, 'Technology Management': 1, 'Technology-Business Liaison': 1, "This futurist's dream of using some tech in a way that inspires critical amounts of people to influence the changes we need to protect": 1, 'UI Design': 1, 'UI Designer': 1, 'UX developer/designer': 1, 'Unsure': 2, 'Urban Planner': 1, 'User Experience Designer': 1469, 'User Interface Design': 1, 'User Interface Designer': 2, 'VR Technology developer': 1, 'Web Design': 1, 'Web Designer': 2, 'Web developer': 1, 'Web development': 1, 'a job in which I can use coding skills to create valuable portals to advance human rights': 1, 'app dev etc.': 1, 'back end': 1, 'back-end': 1, 'code developer...in whatever format': 1, 'computer engineer': 1, 'creative coder / generative artist/designer': 1, 'data analyst': 2, 'data journalist / data visualist': 1, 'designer': 1, 'developer': 1, 'email coder': 1, 'etc...': 1, 'front-end': 1, 'full stack developer': 1, 'full-stack developer': 1, 'game': 1, "i don't know what the difference is between most of these soz lol": 1, 'i dunno!!!!': 1, 'idk': 1, 'improving in my current career as a Learning technologist': 1, 'lab scientist': 1, 'milatary engineer': 1, 'mobile developer': 1, 'network admin': 1, 'non-programmer': 1, 'philosopher': 1, 'plc': 1, 'programmer': 2, 'security expert': 1, 'software developer': 1, 'software engineer': 1, 'support scientific resaerch': 1, 'undeceided': 1, 'undecided': 2, 'virtual reality developer': 1, 'web': 1}
# Order the (job, count) pairs by ascending count, so the most frequently
# mentioned jobs end up at the end of the list.
sorted_InterestingJobs = list(InterestingJobs.items())
sorted_InterestingJobs.sort(key=lambda pair: pair[1])
# NOTE: list.reverse() works in place and returns None, so its result must not
# be assigned back to the variable if a descending order is wanted later.
sorted_InterestingJobs
[("I'm just learning code to increase my skill-set. I see it as a literacy issue.", 1), ('Systems Programming', 1), ('Machine learning engineer', 1), ('Education', 1), ('SEO', 1), ('Bitcoin/Crypto', 1), ('AI and neuroscience', 1), ('AI and Machine Learning', 1), ('Software Developer or Front-End Web Developer', 1), ('Computer Architect', 1), ('Software Development', 1), ('Research and education', 1), ('AI Engineer', 1), ('Journalist/Graphic Designer/Marketing', 1), ('Entreprenuer / Web Dev Hustler', 1), ('Teacher. Teaching students to code.', 1), ('Anything that engages me', 1), ('Project manager', 1), ('Tech art', 1), ('Data/Interactive Journalist', 1), ('GIS Developer', 1), ('Embedded Developer', 1), ('System Engineer', 1), ('Software developer', 1), ('virtual reality developer', 1), ('plc', 1), ('Systems Programmer', 1), ('a job in which I can use coding skills to create valuable portals to advance human rights', 1), ('AI Developer', 1), ('UX developer/designer', 1), ('Non technical', 1), ('support scientific resaerch', 1), ('UI Designer', 1), ('email coder', 1), ('Not Sure Yet', 1), ('Software Engineers', 1), ('Desings', 1), ('Cybersecurity', 1), ('BA or developer', 1), ('full stack developer', 1), ('Technology-Business Liaison', 1), ('Software Specialist', 1), ('Any of them.', 1), ('Web developer', 1), ('software developer', 1), ('Robotics', 1), ('Informatician', 1), ('Software Developper', 1), ('Ceo', 1), ('Software enginner', 1), ('IoT', 1), ('app dev etc.', 1), ('Software Engineering', 1), ('web', 1), ("i don't know what the difference is between most of these soz lol", 1), ('Data Visualization Specialist', 1), ('undeceided', 1), ('Machine learning and AI', 1), ('Systems Engineer', 1), ('Full Stack Developer', 1), ('etc...', 1), ('System Administrator/Network', 1), ('Software Engineer (Computer Science Based)', 1), ('Founder', 1), ('designer', 1), ('AI', 1), ('Artificial Intelligence engineer', 1), ('SWE', 1), ('All - whatever is required to develop tools 
to revolutionize the mechanical engineering process', 1), ('data journalist / data visualist', 1), ('Natural Language Processing', 1), ('Document Controller', 1), ('back-end', 1), ('College professor', 1), ('Data analyst', 1), ('Library Developer', 1), ('I dont yet know', 1), ('Support Engineer or API Support', 1), ('software engineer', 1), ('Embedded hardware', 1), ('Pharmaceutical industry', 1), ('Pharmacy tech', 1), ('Scientific Programming', 1), ("I don't know yet!", 1), ('Cloud computing', 1), ('milatary engineer', 1), ('computer engineer', 1), ('Urban Planner', 1), ('Python Developer', 1), ('Application Support Analyst', 1), ('Desktop Applications Programmer', 1), ('Information Developer', 1), ('IoT Developer', 1), ('Technical Writer', 1), ('System Software', 1), ('Artificial intelligence', 1), ('Information Architect', 1), ('mobile developer', 1), ('Campaign Manager', 1), ('security expert', 1), ('Marketing Automation', 1), ('Compilers', 1), ('full-stack developer', 1), ('Project Management', 1), ('Data Reporter', 1), ('Desktop Applications', 1), ('User Interface Design', 1), ('Physicist', 1), ('Astrophysicist', 1), ('back end', 1), ('Journalist', 1), ('lab scientist', 1), ('Program Manager', 1), ('network admin', 1), ('Bioinformatitian', 1), ('UI Design', 1), ('Criminal Defense Attorney-- focusing on cyber crimes', 1), ('improving in my current career as a Learning technologist', 1), ('Mobile Development', 1), ('Bioinformatics/science', 1), ('Front end', 1), ('philosopher', 1), ('Full Stack Software Engineer', 1), ("This futurist's dream of using some tech in a way that inspires critical amounts of people to influence the changes we need to protect", 1), ('IT specialist', 1), ('Security Business Analyst', 1), ('Infrastructure Architect', 1), ('Financial Services', 1), ('Technology Management', 1), ('Entrepreneur', 1), ('Information Technology', 1), ('Robotics and AI Engineer', 1), ('Databases', 1), ('Web development', 1), ('Digital Humanitites', 1), ('Not 
sure!', 1), ('Developer Evangelist', 1), ('Network Engineer', 1), ('Teacher', 1), ('code developer...in whatever format', 1), ('front-end', 1), ('Not Sure', 1), ('Networking', 1), ('GIS Database Admin', 1), ('Front-End Web Designer', 1), ('Real-time systems', 1), ('Artificial Intelligence Engineer', 1), ("Don't know yet", 1), ('Remote Support', 1), ('Tech lobbiest', 1), ('developer', 1), ('Software Projects Manager', 1), ('Marketing', 1), ('Programming', 1), ('game', 1), ('Desktop applications developer', 1), ('non-programmer', 1), ('Web Design', 1), ('Network', 1), ('Data visualisation', 1), ('VR Technology developer', 1), ('Operating Systems', 1), ('Front End Web Development', 1), ('Robotics Process Automation Specialist', 1), ('Analyst', 1), ('idk', 1), ('creative coder / generative artist/designer', 1), ('Quant (Algorithmic Trader)', 1), ('Growth Hacker', 1), ('I am interested in Game Development', 1), ('i dunno!!!!', 1), ('Research', 2), ('Not sure yet', 2), ('Ethical Hacker', 2), ('Researcher', 2), ('Not sure', 2), ('Project Manager', 2), ('Machine Learning', 2), ('Bioinformatics', 2), ('data analyst', 2), ('Data Analyst', 2), ('Business Analyst', 2), ('undecided', 2), ('Unsure', 2), ('programmer', 2), ('Web Designer', 2), ('Programmer', 2), ('User Interface Designer', 2), ('Java developer', 2), ('Product Designer', 2), ('Machine Learning Engineer', 3), ('Desktop Application Developer', 3), ('Software engineer', 4), ('Artificial Intelligence', 4), ('Software Developer', 6), ('Software Engineer', 11), ('Quality Assurance Engineer', 498), ('Product Manager', 814), ('DevOps / SysAdmin', 927), ('Data Engineer', 1248), ('Information Security', 1326), ('User Experience Designer', 1469), ('Game Developer', 1628), ('Data Scientist', 1643), ('Mobile Developer', 2304), ('Back-End Web Developer', 2772), ('Front-End Web Developer', 3533), ('Full-Stack Web Developer', 4198)]
# First, the frequency table of the learners' citizenship
Survey_2017_c.CountryCitizen.value_counts()
United States of America 2940 India 612 United Kingdom 262 Canada 229 Brazil 152 Poland 152 Russia 113 Ukraine 107 Germany 100 Nigeria 100 Australia 96 Romania 91 France 85 Spain 81 Philippines 67 Italy 64 Netherlands (Holland, Europe) 60 Mexico 55 Serbia 54 Greece 48 China 45 Hungary 42 South Africa 41 Indonesia 41 Turkey 38 Pakistan 37 Bangladesh 35 Croatia 35 Egypt 34 Argentina 33 ... Cyprus 1 Nicaragua 1 Gibraltar 1 Guyana 1 Benin 1 Norfolk Island 1 Channel Islands 1 Rwanda 1 Dominica 1 Somalia 1 American Samoa 1 Guatemala 1 Aruba 1 Bolivia 1 Jordan 1 United Arab Emirates 1 Nambia 1 Guam 1 Angola 1 Cote D'Ivoire 1 Mozambique 1 Togo 1 Cayman Islands 1 El Salvador 1 Bahrain 1 Honduras 1 Lesotho 1 Botswana 1 Sudan 1 Myanmar 1 Name: CountryCitizen, Length: 147, dtype: int64
# Next, the frequency table of the learners' country of residence
Survey_2017_c.CountryLive.value_counts()
United States of America 3125 India 528 United Kingdom 315 Canada 260 Poland 131 Brazil 129 Germany 125 Australia 112 Russia 102 Ukraine 89 Nigeria 84 Spain 77 France 75 Romania 71 Netherlands (Holland, Europe) 65 Italy 62 Philippines 52 Serbia 52 Greece 46 Ireland 43 South Africa 39 Mexico 37 Turkey 36 Singapore 34 Hungary 34 New Zealand 33 Croatia 32 Argentina 32 Norway 31 Indonesia 31 ... Aruba 1 Samoa 1 Papua New Guinea 1 Panama 1 Turkmenistan 1 Cameroon 1 Trinidad & Tobago 1 Gambia 1 Cuba 1 Nicaragua 1 Guatemala 1 Bolivia 1 Vanuatu 1 Guadeloupe 1 Channel Islands 1 Mozambique 1 Somalia 1 Kyrgyzstan 1 Anguilla 1 Yemen 1 Angola 1 Nambia 1 Liberia 1 Cayman Islands 1 Jordan 1 Botswana 1 Gibraltar 1 Sudan 1 Rwanda 1 Qatar 1 Name: CountryLive, Length: 137, dtype: int64
# Same frequency table expressed in percent. The denominator is the total
# number of rows in the frame, so the shares are relative to every row.
Survey_2017_c['CountryLive'].value_counts().div(len(Survey_2017_c)).mul(100)
United States of America 44.693936 India 7.551487 United Kingdom 4.505149 Canada 3.718535 Poland 1.873570 Brazil 1.844966 Germany 1.787757 Australia 1.601831 Russia 1.458810 Ukraine 1.272883 Nigeria 1.201373 Spain 1.101259 France 1.072654 Romania 1.015446 Netherlands (Holland, Europe) 0.929634 Italy 0.886728 Philippines 0.743707 Serbia 0.743707 Greece 0.657895 Ireland 0.614989 South Africa 0.557780 Mexico 0.529176 Turkey 0.514874 Singapore 0.486270 Hungary 0.486270 New Zealand 0.471968 Croatia 0.457666 Argentina 0.457666 Norway 0.443364 Indonesia 0.443364 ... Aruba 0.014302 Samoa 0.014302 Papua New Guinea 0.014302 Panama 0.014302 Turkmenistan 0.014302 Cameroon 0.014302 Trinidad & Tobago 0.014302 Gambia 0.014302 Cuba 0.014302 Nicaragua 0.014302 Guatemala 0.014302 Bolivia 0.014302 Vanuatu 0.014302 Guadeloupe 0.014302 Channel Islands 0.014302 Mozambique 0.014302 Somalia 0.014302 Kyrgyzstan 0.014302 Anguilla 0.014302 Yemen 0.014302 Angola 0.014302 Nambia 0.014302 Liberia 0.014302 Cayman Islands 0.014302 Jordan 0.014302 Botswana 0.014302 Gibraltar 0.014302 Sudan 0.014302 Rwanda 0.014302 Qatar 0.014302 Name: CountryLive, Length: 137, dtype: float64
Based on these data, the largest share of learners (over 70%) live in 10 countries, listed here in descending order: United States of America, India, United Kingdom, Canada, Poland, Brazil, Germany, Australia, Russia, Ukraine.
# The 10 countries of residence that together hold about 70% of the learners,
# ordered from most to fewest learners
listCountries = [
    'United States of America',
    'India',
    'United Kingdom',
    'Canada',
    'Poland',
    'Brazil',
    'Germany',
    'Australia',
    'Russia',
    'Ukraine',
]
# We are going to work with the data from these 10 countries.
# .copy() makes the selection an independent DataFrame: the value corrections
# and new columns applied to it later then modify this frame directly instead
# of a slice of Survey_2017_c (the source of SettingWithCopyWarning).
in_selected_country = Survey_2017_c['CountryLive'].isin(listCountries)
Survey_2017_10country = Survey_2017_c[in_selected_country].copy()
# Frequency table of the money spent on learning, ordered by amount
Survey_2017_10country['MoneyForLearning'].value_counts().sort_index()
0.0 2218 1.0 3 2.0 2 3.0 1 4.0 1 5.0 12 6.0 1 8.0 3 9.0 1 10.0 39 12.0 1 13.0 1 14.0 1 15.0 36 18.0 2 19.0 2 20.0 94 22.0 3 23.0 1 24.0 1 25.0 23 28.0 1 29.0 2 30.0 60 35.0 11 36.0 1 39.0 2 40.0 38 42.0 2 45.0 7 ... 12500.0 2 13000.0 6 13900.0 1 14000.0 9 14023.0 1 15000.0 16 15100.0 1 16000.0 8 16500.0 1 17000.0 2 18000.0 5 19000.0 2 20000.0 19 21000.0 1 22000.0 1 24500.0 1 25000.0 6 26000.0 1 30000.0 2 35000.0 1 40000.0 1 45000.0 1 50000.0 1 65000.0 1 69000.0 1 70000.0 1 80000.0 1 100000.0 2 120000.0 1 200000.0 1 Name: MoneyForLearning, Length: 177, dtype: int64
# We are making a correction in the column MonthsProgramming by converting all
# 0.0 to 1.0, so that dividing by the number of months never divides by zero.
# A single .loc assignment is used instead of chained indexing
# (df[col][mask] = value), which may write to a temporary copy and raises
# SettingWithCopyWarning.
Survey_2017_10country.loc[
    Survey_2017_10country['MonthsProgramming'] == 0.0, 'MonthsProgramming'
] = 1.0
/dataquest/system/env/python3/lib/python3.4/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy /dataquest/system/env/python3/lib/python3.4/site-packages/pandas/core/generic.py:5984: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy /dataquest/system/env/python3/lib/python3.4/site-packages/IPython/core/interactiveshell.py:2885: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# List the distinct values; any nan among them means the frame still needs cleaning
Survey_2017_10country.MonthsProgramming.unique()
array([ 6., 5., 12., 1., nan, 9., 14., 28., 3., 2., 25., 8., 60., 18., 19., 36., 24., 40., 20., 4., 48., 16., 26., 13., 10., 11., 30., 29., 15., 72., 7., 17., 54., 84., 240., 42., 50., 34., 55., 59., 70., 44., 49., 21., 96., 45., 32., 108., 39., 120., 31., 43., 33., 52., 41., 23., 27., 200., 180., 100., 360., 113., 144., 73., 83., 190., 35., 38., 80., 744., 156., 22., 171., 37., 198., 58., 480., 111., 408., 300., 89.])
# Clean the dataframe: keep only rows where both MoneyForLearning and
# MonthsProgramming are present (a row missing either value is dropped)
Survey_2017_10country = Survey_2017_10country.dropna(
    subset=['MoneyForLearning', 'MonthsProgramming']
)
# Add the column MoneySpentPrMonth: total money spent on learning divided by
# the number of months spent programming
Survey_2017_10country['MoneySpentPrMonth'] = Survey_2017_10country['MoneyForLearning'].div(
    Survey_2017_10country['MonthsProgramming']
)
# Frequency table of the money spent per month, rarest values first
Survey_2017_10country.MoneySpentPrMonth.value_counts().sort_values()
5333.333333 1 12500.000000 1 941.176471 1 2400.000000 1 347.222222 1 1571.428571 1 3.900000 1 2.187500 1 1714.285714 1 54.166667 1 13.636364 1 310.344828 1 3125.000000 1 0.208333 1 5666.666667 1 3166.666667 1 2337.166667 1 65.000000 1 428.571429 1 0.750000 1 11.000000 1 48.000000 1 114.285714 1 18.823529 1 482.758621 1 170.000000 1 607.142857 1 388.888889 1 110.000000 1 13.500000 1 ... 1.666667 18 30.000000 20 6.666667 21 333.333333 21 5.555556 22 1000.000000 25 125.000000 26 11.111111 27 3.333333 28 2.500000 28 20.833333 30 41.666667 30 250.000000 32 500.000000 36 200.000000 38 66.666667 41 5.000000 42 4.166667 47 166.666667 52 10.000000 55 12.500000 55 83.333333 57 20.000000 66 33.333333 75 8.333333 76 25.000000 83 50.000000 87 16.666667 89 100.000000 89 0.000000 2157 Name: MoneySpentPrMonth, Length: 415, dtype: int64
# Sum of the monthly spending of all learners in each country, largest first
Survey_2017_10country.groupby('CountryLive')['MoneySpentPrMonth'].agg('sum').nlargest(10)
CountryLive United States of America 668718.121808 India 62551.754651 Australia 42765.382132 Canada 27242.630637 Russia 22484.607143 United Kingdom 12704.109639 Poland 9232.830087 Brazil 6242.573651 Germany 5937.525000 Ukraine 4290.875053 Name: MoneySpentPrMonth, dtype: float64
# Mean monthly spending per learner in each country, largest first
Survey_2017_10country.groupby('CountryLive')['MoneySpentPrMonth'].agg('mean').nlargest(10)
CountryLive Australia 419.268452 Russia 284.615280 United States of America 227.997996 India 135.100982 Canada 113.510961 Poland 75.678935 Brazil 56.239402 Ukraine 54.314874 Germany 52.083553 United Kingdom 45.534443 Name: MoneySpentPrMonth, dtype: float64
# Spread (standard deviation) of the monthly spending within each country
Survey_2017_10country.groupby('CountryLive')['MoneySpentPrMonth'].agg('std')
CountryLive Australia 2209.244869 Brazil 229.344918 Canada 441.014158 Germany 175.196725 India 692.960378 Poland 308.766078 Russia 2248.206469 Ukraine 144.997388 United Kingdom 162.311836 United States of America 1940.245614 Name: MoneySpentPrMonth, dtype: float64
# Let's plot these data using a boxplot: one box per country of residence
Survey_2017_10country.boxplot(column='MoneySpentPrMonth', by='CountryLive', figsize=(12, 9))
# Clear the figure-level super-title so only the title set below is shown
plt.suptitle('')
plt.title('Money spent for courses per month per country')
plt.ylim(0, 50500)
# Rotate the country names so the long labels stay readable
plt.xticks(rotation=60)
# Label typo fixed ("spenfing" -> "spending") to match the second boxplot
plt.ylabel('Monthly spending in $')
plt.xlabel('Country of residence')
plt.show()
# Drop the outliers: keep only learners spending strictly less than $10000 per month
Survey_2017_10countryWo = Survey_2017_10country.loc[
    Survey_2017_10country['MoneySpentPrMonth'].lt(10000)
]
# Boxplot of the outlier-free data, again one box per country of residence.
# No explicit y-limit is set here; the axis is left to autoscale.
Survey_2017_10countryWo.boxplot(
    column='MoneySpentPrMonth',
    by='CountryLive',
    figsize=(12, 9),
)
plt.suptitle('')
plt.title('Money spent for courses per month per country')
plt.xlabel('Country of residence')
plt.ylabel('Monthly spending in $')
plt.xticks(rotation=60)
plt.show()
# Average spending per month and per country once the outliers are removed
Survey_2017_10countryWo.groupby('CountryLive')['MoneySpentPrMonth'].agg('mean').nlargest(10)
CountryLive Australia 225.399823 United States of America 155.459187 India 113.748387 Canada 113.510961 Poland 75.678935 Brazil 56.239402 Ukraine 54.314874 Germany 52.083553 United Kingdom 45.534443 Russia 31.853938 Name: MoneySpentPrMonth, dtype: float64