from IPython.display import Image
from IPython.display import clear_output
from IPython.display import FileLink, FileLinks
Start by doing today's quiz
Go to Canvas, Modules -> Day 4 -> Review Day 3
~20 minutes
row = 'sofa|2000|buy|Uppsala'
fields = row.split('|')
price = fields[1]
if price == 2000:
print('The price is a number!')
if price == '2000':
print('The price is a string!')
The price is a string!
print(sorted([ 2000, 30, 100 ]))
[30, 100, 2000]
print(sorted(['2000', '30', '100']))
['100', '2000', '30']
ord('3')
Each type stores a specific kind of information:
`int` for integers,
`float` for floating point values (decimals),
`str` for strings,
`list` for lists,
`dict` for dictionaries.
Each type supports different operations, functions and methods.
30 > 2000
False
'30' > '2000'
True
30 > int('2000')
'12345'[2]
12345[2]
max('2000')
'2'
max(2000)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[7], line 1 ----> 1 max(2000) TypeError: 'int' object is not iterable
import math
math.cos(3.14)
math.cos('3.14')
'ACTG'.lower()
'actg'
[1, 2, 3].lower()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[9], line 1 ----> 1 [1, 2, 3].lower() AttributeError: 'list' object has no attribute 'lower'
set([]).add('tiger')
[].add('tiger')
dir()
dir('ACTG') # list all attributes
dir(str) # list all attributes
float('2000')
2000.0
float('0.5')
0.5
float('1e9')
1000000000.0
float('1e-2')
int('2000')
int('1.5')
int('1e9')
1
, 0
, '1'
, '0'
, ''
, {}
¶bool(1)
bool(0)
bool('0')
bool('')
bool([])
bool({})
values = [1, 0, '', '0', '1', [], [0]]
for x in values:
if x:
print(repr(x), 'is true!')
else:
print(repr(x), 'is false!')
1 is true! 0 is false! '' is false! '0' is true! '1' is true! [] is false! [0] is true!
`if x` is equivalent to `if bool(x)`.
1
equivalent to True
?1 == True
x = 1
if x is True:
print(repr(x), 'is true!')
else:
print(repr(x), 'is false!')
x = 1
if bool(x) is True:
print(repr(x), 'is true!')
else:
print(repr(x), 'is false!')
`if x is True` is not equivalent to `if bool(x) is True`.
genre_list = ["comedy", "drama", "drama", "sci-fi"]
genre_list
['comedy', 'drama', 'drama', 'sci-fi']
genres = set(genre_list)
genres
{'comedy', 'drama', 'sci-fi'}
'drama' in genre_list
'drama' in genres
# which operation is faster?
True
genre_counts = {"comedy": 1, "drama": 2, "sci-fi": 1}
genre_counts
{'comedy': 1, 'drama': 2, 'sci-fi': 1}
movie = {"rating": 10.0, "title": "Toy Story"}
movie
{'rating': 10.0, 'title': 'Toy Story'}
def echo(message):  # starts a new function definition
    """Print ``message`` and return the same object to the caller."""
    # this function echoes the message
    print(message)  # print state of the variable
    return message  # return the value to end the function
list("hello")
['h', 'e', 'l', 'l', 'o']
str(['h', 'e', 'l', 'l', 'o'])
'_'.join('hello')
'h_e_l_l_o'
return
statementHOST = 'global'
def show_host():
    """Print the module-level ``HOST``.

    The function only reads ``HOST``, so the name resolves to the global
    variable without any ``global`` declaration.
    """
    print(f'HOST inside the function = {HOST}')
show_host()
print(f'HOST outside the function = {HOST}')
HOST inside the function = global HOST outside the function = global
HOST = 'global'
def change_host():
    """Show that assigning to ``HOST`` inside a function creates a local name.

    There is no ``global HOST`` declaration, so the assignment below binds a
    new local variable; a module-level ``HOST`` keeps its value.
    """
    HOST = 'local'  # local variable, shadows any global HOST for this call
    print(f'HOST inside the function = {HOST}')
def app2():
    """Print the module-level ``HOST`` as seen from another function."""
    print(HOST)
print(f'HOST outside the function before change = {HOST}')
change_host()
print(f'HOST outside the function after change = {HOST}')
app2()
HOST outside the function before change = global HOST inside the function = local HOST outside the function after change = global global
HOST = 'global'
def change_host(HOST):
    """Show that rebinding a parameter does not affect the caller's variable.

    Args:
        HOST: Any value; it is immediately replaced inside the function.
    """
    HOST = 'local'  # rebinds the parameter only; the caller's object is untouched
    print(f'HOST inside the function = {HOST}')
print(f'HOST outside the function before change = {HOST}')
change_host(HOST)
print(f'HOST outside the function after change = {HOST}')
List as global variables
MOVIES = ['Toy story', 'Home alone']
def change_movie():
    """Show that assigning a new list to ``MOVIES`` creates a local variable.

    The assignment binds a fresh local list, so a module-level ``MOVIES`` is
    not modified by this call.
    """
    MOVIES = ['Fargo', 'The Usual Suspects']  # local list, not the global one
    print(f'MOVIES inside the function = {MOVIES}')
print(f'MOVIES outside the function before change = {MOVIES}')
change_movie()
print(f'MOVIES outside the function after change = {MOVIES}')
MOVIES = ['Toy story', 'Home alone']
def change_movie():
    """Append two titles to the module-level ``MOVIES`` list in place.

    ``MOVIES`` is never assigned here, so the name refers to the global list;
    ``extend`` mutates that list and the change is visible after the call.
    """
    MOVIES.extend(['Fargo', 'The Usual Suspects'])
    print(f'MOVIES inside the function = {MOVIES}')
print(f'MOVIES outside the function before change = {MOVIES}')
change_movie()
print(f'MOVIES outside the function after change = {MOVIES}')
MOVIES outside the function before change = ['Toy story', 'Home alone'] MOVIES inside the function = ['Toy story', 'Home alone', 'Fargo', 'The Usual Suspects'] MOVIES outside the function after change = ['Toy story', 'Home alone', 'Fargo', 'The Usual Suspects']
Takeaway: be careful when using global variables. Do not use them unless you know what you are doing.
The `return` statement
A function that counts the number of occurrences of 'C' (upper or lower case) in the argument string.
def cytosine_count(nucleotides):
    """Return how many cytosines ('c' or 'C') ``nucleotides`` contains.

    Args:
        nucleotides: Iterable of single characters, typically a DNA string.

    Returns:
        The number of items equal to 'c' or 'C'; 0 for empty input.
    """
    # Tuple membership compares each item with ==, exactly like
    # ``x == 'c' or x == 'C'``.
    return sum(1 for base in nucleotides if base in ('c', 'C'))
count1 = cytosine_count('CATATTAC')
count2 = cytosine_count('tagtag')
print(count1, "\n", count2)
2 0
Functions that `return` their result are easier to repurpose than those that `print` it.
cytosine_count('catattac') + cytosine_count('tactactac')
5
def print_cytosine_count(nucleotides):
    """Print the number of cytosines ('c' or 'C') in ``nucleotides``.

    The count is printed, not returned: the call itself evaluates to ``None``
    and therefore cannot be used in further arithmetic.

    Args:
        nucleotides: Iterable of single characters, typically a DNA string.
    """
    count = sum(1 for base in nucleotides if base in ('c', 'C'))
    print(count)
print_cytosine_count('CATATTAC')
print_cytosine_count('tagtag')
2 0
print_cytosine_count('catattac') + print_cytosine_count('tactactac')
2 3
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[27], line 1 ----> 1 print_cytosine_count('catattac') + print_cytosine_count('tactactac') TypeError: unsupported operand type(s) for +: 'NoneType' and 'NoneType'
A function without a `return` statement returns `None`
def foo():
    """Do nothing useful; with no ``return`` statement the call yields ``None``."""
    do_nothing = 1  # placeholder body, the value is never used
result = foo()
print(f'Return value of foo() = {result}')
Return value of foo() = None
return
for all values that you might want to use later in your programNone
¶return
statementNone
is None
, not anything elseNone == 0
False
None == False
False
None == ''
False
bool(None)
False
type(None)
NoneType
fh = open('../files/fruits.txt', mode='w', encoding='utf-8'); fh.close()
sorted([1, 4, 100, 5, 6], reverse=True)
[100, 6, 5, 4, 1]
record = 'gene_id INSR "insulin receptor"'
record.split(' ', 2)
['gene_id', 'INSR', '"insulin receptor"']
record.split(sep=' ', maxsplit=2)
['gene_id', 'INSR', '"insulin receptor"']
fh = open('../files/fruits.txt', mode='w', encoding='utf-8'); fh.close()
fh = open('../files/fruits.txt', encoding='utf-8', mode='w'); fh.close()
fh = open('../files/fruits.txt', 'w', encoding='utf-8'); fh.close()
fh = open('../files/fruits.txt', mode='w', encoding='utf-8'); fh.close()
fh = open('files/recipes.txt', encoding='utf-8', 'w'); fh.close()
Cell In[42], line 1 fh = open('files/recipes.txt', encoding='utf-8', 'w'); fh.close() ^ SyntaxError: positional argument follows keyword argument
sorted([1, 4, 100, 5, 6], reverse=True)
[100, 6, 5, 4, 1]
sorted([1, 4, 100, 5, 6], True)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[44], line 1 ----> 1 sorted([1, 4, 100, 5, 6], True) TypeError: sorted expected 1 argument, got 2
sorted(iterable, /, *, key=None, reverse=False)
Parameters before `/` must be specified by position.
Parameters after `*` must be specified by keyword.
def format_sentence(subject, value = 13, end = "...."):
return 'The ' + subject + ' is ' + value + end
print(format_sentence('lecture', 'ongoing', '.'))
print(format_sentence('lecture', '!', value='ongoing'))
print(format_sentence(subject='lecture', value='ongoing', end='...'))
The lecture is ongoing.
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[45], line 6 2 return 'The ' + subject + ' is ' + value + end 4 print(format_sentence('lecture', 'ongoing', '.')) ----> 6 print(format_sentence('lecture', '!', value='ongoing')) 8 print(format_sentence(subject='lecture', value='ongoing', end='...')) TypeError: format_sentence() got multiple values for argument 'value'
def format_sentence(subject, value, end='.'):
    """Return the sentence 'The <subject> is <value><end>'.

    Args:
        subject: What the sentence is about.
        value: What the subject is said to be.
        end: Closing punctuation; defaults to a single full stop.

    Returns:
        The assembled sentence as a string.
    """
    return f'The {subject} is {value}{end}'
#print(format_sentence('lecture', 'ongoing'))
print(format_sentence('lecture', 'ongoing', '...'))
The lecture is ongoing...
None
def format_sentence(subject, value, end='.', second_value=None):
    """Return 'The <subject> is <value>[ and <second_value>]<end>'.

    Args:
        subject: What the sentence is about.
        value: What the subject is said to be.
        end: Closing punctuation; defaults to a single full stop.
        second_value: Optional second description; ``None`` (the default)
            means the ' and ...' part is left out.

    Returns:
        The assembled sentence as a string.
    """
    # ``is None`` rather than truthiness, so an empty string still counts as
    # a supplied second value.
    if second_value is None:
        return f'The {subject} is {value}{end}'
    return f'The {subject} is {value} and {second_value}{end}'
print(format_sentence('lecture', 'ongoing'))
print(format_sentence('lecture', 'ongoing', second_value='self-referential', end='!'))
The lecture is ongoing. The lecture is ongoing and self-referential!
None
¶None
to the other false values such as 0
, False
and ''
use is None
:counts = {'drama': 2, 'romance': 0}
counts.get('romance'), counts.get('thriller')
counts.get('romance') is None
counts.get('thriller') is None
values = [None, 1, 0, '', '0', '1', [], [0]]
for x in values:
if x is None:
print(repr(x), 'is None')
if not x:
print(repr(x), 'is false')
if x:
print(repr(x), 'is true')
Notebook Day_4_Exercise_1 (~30 minutes)
Go to Canvas, Modules -> Day 4 -> Exercise 1 - day 4
Extra reading:
import sys
sys.argv[1]
'-f'
from datetime import datetime
print(datetime.now())
2024-11-14 14:43:47.521944
import os
os.system("ls")
pip install
or conda install
,
text = 'Programming,is,cool'
help(text.split)
Help on built-in function split: split(sep=None, maxsplit=-1) method of builtins.str instance Return a list of the words in the string, using sep as the delimiter string. sep The delimiter according which to split the string. None (the default value) means split according to any whitespace, and discard empty strings from the result. maxsplit Maximum number of splits to do. -1 (the default value) means no limit.
text.split(sep=',')
['Programming', 'is', 'cool']
urllib
, given the URL https://www.python.org/static/img/python-logo@2x.pngimport urllib
help(urllib)
Help on package urllib: NAME urllib MODULE REFERENCE https://docs.python.org/3.9/library/urllib The following documentation is automatically generated from the Python source files. It may be incomplete, incorrect or include features that are considered implementation detail and may vary between Python implementations. When in doubt, consult the module reference at the location listed above. PACKAGE CONTENTS error parse request response robotparser FILE /Users/kostas/opt/miniconda3/envs/python-workshop-teacher/lib/python3.9/urllib/__init__.py
Use Python's `urllib` to download the Python logo from the internet, given the URL https://www.python.org/static/img/python-logo@2x.png
import urllib.request
url = "https://www.python.org/static/img/python-logo@2x.png"
filename = "python-logo.png" # The name you want to give to the downloaded file
urllib.request.urlretrieve(url, filename)
print("Download completed.")
help(math.sqrt)
math.sqrt(3)
import math
math.sqrt(3)
import math as m
m.sqrt(3)
from math import sqrt
sqrt(3)
from pprint import pprint
def process_file(filename, chrom, pos):
    """
    Read a very large vcf file, search for lines matching
    chromosome chrom and position pos.
    Print the genotypes of the matching lines.

    Args:
        filename: Path of the tab-separated file to scan.
        chrom: Value compared as a string with the first column.
        pos: Integer compared with the second column after ``int()``.

    Lines starting with '#' are skipped. For each match, the columns from
    the tenth onwards are printed as a list; the last item keeps the line's
    trailing newline.
    """
    # The context manager closes the file even if a line fails to parse;
    # the file is still read lazily, one line at a time.
    with open(filename) as vcf:
        for line in vcf:
            if line.startswith('#'):
                continue
            col = line.split('\t')
            if col[0] == chrom and int(col[1]) == pos:
                print(col[9:])
help(process_file)
Help on function process_file in module __main__: process_file(filename, chrom, pos) Read a very large vcf file, search for lines matching chromosome chrom and position pos. Print the genotypes of the matching lines.
"""
What does this function do?
"""
# implementation details
At the beginning of the file
"""
This module provides functions for ...
"""
import random
def make_list(x):
    """Return the integers 0 .. x-1 in random order.

    Args:
        x: Length of the list to build; values of 0 or less give ``[]``.

    Returns:
        A new list holding each integer in ``range(x)`` exactly once,
        shuffled with ``random.shuffle``.
    """
    li = list(range(x))
    random.shuffle(li)  # shuffles in place and returns None
    return li
my_list[5] += other_list[3] # explain why you do this!
from files import mywork
mywork.pipeline(["accctt", "gaccct"])
title = 'Toy Story'
rating = 10
print('The result is: ' + title + ' with rating: ' + str(rating))
# f-strings (since python 3.6)
print(f'The result is: {title} with rating: {rating}')
# format method
print('The result is: {} with rating: {}'.format(title, rating))
# the ancient way (python 2)
print('The result is: %s with rating: %s' % (title, rating))
Learn more from the Python docs: https://docs.python.org/3.9/library/string.html#format-string-syntax
pick_movie(year=1996, rating_min=8.5)
The Bandit
pick_movie(rating_max=8.0, genre="Mystery")
Twelve Monkeys
DataFrame
typeimport pandas as pd
data = {
'age': [1,2,3,4],
'circumference': [2,3,5,10],
'height': [30, 35, 40, 50]
}
df = pd.DataFrame(data)
df
age | circumference | height | |
---|---|---|---|
0 | 1 | 2 | 30 |
1 | 2 | 3 | 35 |
2 | 3 | 5 | 40 |
3 | 4 | 10 | 50 |
# add row index
row_index = ["tree1", "tree2", "tree3", "tree4"]
df = df.set_index(pd.Index(row_index))
help(pd.Index)
pd.read_table
: tab separated values .tsv
pd.read_csv
: comma separated values .csv
pd.read_excel
: Excel spreadsheets .xlsx
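For a data frame `df`: `df.to_csv(sep='\t')` (tab separated), `df.to_csv()` (comma separated), `df.to_excel()` (Excel spreadsheet). Note that pandas has no `df.to_table()` method.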
df = pd.read_table('../downloads/Orange_1.tsv')
df
age | circumference | height | |
---|---|---|---|
0 | 1 | 2 | 30 |
1 | 2 | 3 | 35 |
2 | 3 | 5 | 40 |
3 | 4 | 10 | 50 |
age
, circumference
, height
df2 = pd.read_excel('../downloads/Orange_1.xlsx')
df2
age | circumference | height | |
---|---|---|---|
0 | 1 | 2 | 30 |
1 | 2 | 3 | 35 |
2 | 3 | 5 | 40 |
3 | 4 | 10 | 50 |
df
age | circumference | height | |
---|---|---|---|
0 | 1 | 2 | 30 |
1 | 2 | 3 | 35 |
2 | 3 | 5 | 40 |
3 | 4 | 10 | 50 |
df.shape
(4, 3)
df.describe()
age | circumference | height | |
---|---|---|---|
count | 4.000000 | 4.000000 | 4.000000 |
mean | 2.500000 | 5.000000 | 38.750000 |
std | 1.290994 | 3.559026 | 8.539126 |
min | 1.000000 | 2.000000 | 30.000000 |
25% | 1.750000 | 2.750000 | 33.750000 |
50% | 2.500000 | 4.000000 | 37.500000 |
75% | 3.250000 | 6.250000 | 42.500000 |
max | 4.000000 | 10.000000 | 50.000000 |
df.max()
age 4 circumference 10 height 50 dtype: int64
dataframe.columnname
dataframe['columnname']
df
age | circumference | height | |
---|---|---|---|
0 | 1 | 2 | 30 |
1 | 2 | 3 | 35 |
2 | 3 | 5 | 40 |
3 | 4 | 10 | 50 |
df_new = df.age
df_new
0 1 1 2 2 3 3 4 Name: age, dtype: int64
df['age']
0 1 1 2 2 3 3 4 Name: age, dtype: int64
df
age | circumference | height | |
---|---|---|---|
0 | 1 | 2 | 30 |
1 | 2 | 3 | 35 |
2 | 3 | 5 | 40 |
3 | 4 | 10 | 50 |
df[['age', 'height']]
age | height | |
---|---|---|
0 | 1 | 30 |
1 | 2 | 35 |
2 | 3 | 40 |
3 | 4 | 50 |
df[['height', 'age']]
height | age | |
---|---|---|
0 | 30 | 1 |
1 | 35 | 2 |
2 | 40 | 3 |
3 | 50 | 4 |
df
age | circumference | height | |
---|---|---|---|
0 | 1 | 2 | 30 |
1 | 2 | 3 | 35 |
2 | 3 | 5 | 40 |
3 | 4 | 10 | 50 |
df.loc[0] # select the first row
age 1 circumference 2 height 30 Name: 0, dtype: int64
df.loc[1:3] # select from row 2 to 4
age | circumference | height | |
---|---|---|---|
1 | 2 | 3 | 35 |
2 | 3 | 5 | 40 |
3 | 4 | 10 | 50 |
df.loc[[1, 3, 0]] # select row 2, 4 and 1
age | circumference | height | |
---|---|---|---|
1 | 2 | 3 | 35 |
3 | 4 | 10 | 50 |
0 | 1 | 2 | 30 |
df
age | circumference | height | |
---|---|---|---|
0 | 1 | 2 | 30 |
1 | 2 | 3 | 35 |
2 | 3 | 5 | 40 |
3 | 4 | 10 | 50 |
df.loc[[0], ['age']]
age | |
---|---|
0 | 1 |
df[['age', 'circumference']].describe()
age | circumference | |
---|---|---|
count | 4.000000 | 4.000000 |
mean | 2.500000 | 5.000000 |
std | 1.290994 | 3.559026 |
min | 1.000000 | 2.000000 |
25% | 1.750000 | 2.750000 |
50% | 2.500000 | 4.000000 |
75% | 3.250000 | 6.250000 |
max | 4.000000 | 10.000000 |
df['age'].std()
1.2909944487358056
df.loc[1:10]
dataframe.iloc[index]
dataframe.iloc[start:stop]
Further reading from pandas documentation: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html
df
#df.iloc[:,0] # Show the first column
#df.iloc[1] # Show the second row
df.iloc[1,0] # Show the cell of the second row and the first column (you get number without index)
2
import math
df['radius'] = df['circumference'] / (2.0 * math.pi)
df
age | circumference | height | radius | |
---|---|---|---|---|
0 | 1 | 2 | 30 | 0.318310 |
1 | 2 | 3 | 35 | 0.477465 |
2 | 3 | 5 | 40 | 0.795775 |
3 | 4 | 10 | 50 | 1.591549 |
df1 = pd.DataFrame({
'age': [1,2,3,4],
'circumference': [2,3,5,10],
'height': [30, 35, 40, 50]
})
df1
age | circumference | height | |
---|---|---|---|
0 | 1 | 2 | 30 |
1 | 2 | 3 | 35 |
2 | 3 | 5 | 40 |
3 | 4 | 10 | 50 |
df2 = pd.DataFrame({
'name': ['palm', 'ada', 'ek', 'olive'],
'price': [1423, 2000, 102, 30]
})
df2
name | price | |
---|---|---|
0 | palm | 1423 |
1 | ada | 2000 |
2 | ek | 102 |
3 | olive | 30 |
pd.concat([df2, df1], axis=0).reset_index(drop=True)
name | price | age | circumference | height | |
---|---|---|---|---|---|
0 | palm | 1423.0 | NaN | NaN | NaN |
1 | ada | 2000.0 | NaN | NaN | NaN |
2 | ek | 102.0 | NaN | NaN | NaN |
3 | olive | 30.0 | NaN | NaN | NaN |
4 | NaN | NaN | 1.0 | 2.0 | 30.0 |
5 | NaN | NaN | 2.0 | 3.0 | 35.0 |
6 | NaN | NaN | 3.0 | 5.0 | 40.0 |
7 | NaN | NaN | 4.0 | 10.0 | 50.0 |
e.g.
df = pd.read_table('../downloads/Orange.tsv')
df.head(3) # can also use .head()
Tree | age | circumference | |
---|---|---|---|
0 | 1 | 118 | 30 |
1 | 1 | 484 | 58 |
2 | 1 | 664 | 87 |
df.Tree.unique()
array([1, 2, 3])
df[df['Tree'] == 1]
Tree | age | circumference | |
---|---|---|---|
0 | 1 | 118 | 30 |
1 | 1 | 484 | 58 |
2 | 1 | 664 | 87 |
3 | 1 | 1004 | 115 |
4 | 1 | 1231 | 120 |
5 | 1 | 1372 | 142 |
6 | 1 | 1582 | 145 |
df[df.age > 500]
Tree | age | circumference | |
---|---|---|---|
2 | 1 | 664 | 87 |
3 | 1 | 1004 | 115 |
4 | 1 | 1231 | 120 |
5 | 1 | 1372 | 142 |
6 | 1 | 1582 | 145 |
9 | 2 | 664 | 111 |
10 | 2 | 1004 | 156 |
11 | 2 | 1231 | 172 |
12 | 2 | 1372 | 203 |
13 | 2 | 1582 | 203 |
16 | 3 | 664 | 75 |
17 | 3 | 1004 | 108 |
18 | 3 | 1231 | 115 |
19 | 3 | 1372 | 139 |
20 | 3 | 1582 | 140 |
df[(df.age > 500) & (df.circumference < 100) ]
Tree | age | circumference | |
---|---|---|---|
2 | 1 | 664 | 87 |
16 | 3 | 664 | 75 |
type(pd.DataFrame({"genre": ['Thriller', 'Drama'], "rating": [10, 9]}).rating.iloc[0])
#young = df[df.age < 200]
#young
df[df.age < 1000]
df
max_c=df.circumference.max()
max_c
df[df.circumference==max_c]
Tree | age | circumference | |
---|---|---|---|
12 | 2 | 1372 | 203 |
13 | 2 | 1582 | 203 |
max_c = df.circumference.max()
print(max_c)
df[df.circumference == max_c]
df[(df.age > 100) & (df.age <= 250)]
Here's a dictionary of students and their grades:
students = {'student': ['bob', 'sam', 'joe'], 'grade': [1, 3, 4]}
Use Pandas to:
students = {'student': ['bob', 'sam', 'joe'], 'grade': [1, 3, 4]}
ds=pd.DataFrame(students)
ds.grade.mean()
2.6666666666666665
students = {'student': ['bob', 'sam', 'joe'], 'grade': [1, 3, 4]}
df = pd.DataFrame(students)
df.grade.mean()
# df['grade'].mean()
df.columnname.plot()
small_df = pd.read_table('../downloads/Orange_1.tsv')
small_df
age | circumference | height | |
---|---|---|---|
0 | 1 | 2 | 30 |
1 | 2 | 3 | 35 |
2 | 3 | 5 | 40 |
3 | 4 | 10 | 50 |
small_df.plot(x='age', y='circumference', kind='line') # plot the relationship of age and circumference
# try with other types of plots, e.g. scatter
<AxesSubplot:xlabel='age'>
import matplotlib.pyplot as plt
plt.show()
%matplotlib inline
small_df[['age']].plot(kind='bar')
<AxesSubplot:>
small_df[['circumference', 'age']].plot(kind='bar')
<AxesSubplot:>
df[['circumference', 'age']].plot(kind='bar', figsize=(12, 8), fontsize=16)
small_df.plot(kind='hist', y = 'age', fontsize=18)
<AxesSubplot:ylabel='Frequency'>
small_df.plot(kind='box', y = 'age')
<AxesSubplot:>
df.plot(kind="scatter", x="column_name", y="other_column_name")
df.plot(kind="scatter", x='age', y='circumference',
figsize=(12, 8), fontsize=14)
dataframe.plot(kind="line", x=..., y=...)
tree1 = df[df['Tree'] == 1]
tree1.plot(kind="line", x='age', y='circumference',
fontsize=14, figsize=(12,8))
df.groupby('Tree')
df.groupby('Tree').plot(kind="line", x='age', y='circumference')
df.groupby('Tree').groups
Modules -> Day 4 -> Exercise 2 - day 4
Orange_1.tsv