#!/usr/bin/env python # coding: utf-8 # *** # # 数据科学的编程工具 # > # Python使用简介 # *** # # 王成军 # # wangchengjun@nju.edu.cn # # 计算传播网 http://computational-communication.com # # 人生苦短,我用Python。 # # Python(/ˈpaɪθən/)是一种面向对象、解释型计算机程序设计语言 # - 由Guido van Rossum于1989年底发明 # - 第一个公开发行版发行于1991年 # - Python语法简洁而清晰 # - 具有强大的标准库和丰富的第三方模块 # - 它常被昵称为胶水语言 # - TIOBE编程语言排行榜“2010年度编程语言” # # # # 特点 # - 免费、功能强大、使用者众多 # - 与R和MATLAB相比,Python是一门更易学、更严谨的程序设计语言。使用Python编写的脚本更易于理解和维护。 # - 如同其它编程语言一样,Python语言的基础知识包括:类型、列表(list)和元组(tuple)、字典(dictionary)、条件、循环、异常处理等。 # - 关于这些,初阶读者可以阅读《Beginning Python》一书(Hetland, 2005)。 # # ## Python中包含了丰富的类库。 # 众多开源的科学计算软件包都提供了Python的调用接口,例如著名的计算机视觉库OpenCV。 # Python本身的科学计算类库发展也十分完善,例如NumPy、SciPy和matplotlib等。 # 就社会网络分析而言,igraph, networkx, graph-tool, Snap.py等类库提供了丰富的网络分析工具 # ## Python软件与IDE # 目前最新的Python版本为3.0,更稳定的2.7版本。 # 编译器是编写程序的重要工具。 # 免费的Python编译器有Spyder、PyCharm(免费社区版)、Ipython、Vim、 Emacs、 Eclipse(加上PyDev插件)。 # # # Installing Anaconda Python # - Use the Anaconda Python # - http://continuum.io/downloads.html # # 第三方包可以使用pip install的方法安装。 # - 可以点击ToolsOpen command prompt # - 然后在打开的命令窗口中输入: # - pip install beautifulsoup4 # # > pip install beautifulsoup4 # - NumPy /SciPy for scientific computing # - pandas to make Python usable for data analysis # - matplotlib to make graphics # - scikit-learn for machine learning # # In[90]: import random, datetime import numpy as np import pylab as plt import statsmodels.api as sm from scipy.stats import norm from scipy.stats.stats import pearsonr # # Variable Type # # In[93]: # str, int, float str(3) # In[193]: # int int('5') # In[192]: # float float('7.1') # In[194]: range(10) # In[190]: range(1, 10) # # dir & help # # 当你想要了解对象的详细信息时使用 # In[96]: dir # In[99]: dir(str)[-5:] # In[23]: help(str) # In[25]: x = ' Hello WorlD ' dir(x)[-10:] # In[111]: # lower x.lower() # In[112]: # upper x.upper() # In[113]: # rstrip x.rstrip() # In[115]: # strip x.strip() # In[26]: # replace x.replace('lo', '') # In[27]: # split x.split('lo') # In[28]: # join ','.join(['a', 'b']) # # type # 当你想要了解变量类型时使用type # In[100]: x = 'hello world' type(x) # # Data Structure # list, tuple, set, dictionary, array # # In[119]: l = [1,2,3,3] # list t = (1, 2, 3, 3) # tuple s = set([1,2,3,3]) # set d = {'a':1,'b':2,'c':3} # dict a = np.array(l) # array print l, t, s, d, a # In[182]: l = [1,2,3,3] # list l.append(4) l # In[1]: d = {'a':1,'b':2,'c':3} # dict d.keys() # In[2]: d = {'a':1,'b':2,'c':3} # dict d.values() # In[185]: d = {'a':1,'b':2,'c':3} # dict d['b'] # In[187]: d = {'a':1,'b':2,'c':3} # dict d.items() # # 定义函数 # In[4]: def devidePlus(m, n): # 结尾是冒号 y = float(m)/n+ 1 # 注意:空格 return y # 注意:return # ## For 循环 # In[188]: range(10) # In[189]: range(1, 10) # In[123]: for i in range(10): print i, i*10, i**2 # In[122]: for i in range(10): print i*10 # In[5]: for i in range(10): print devidePlus(i, 2) # In[121]: # 列表内部的for循环 r = [devidePlus(i, 2) for i in range(10)] r # # map # In[21]: map(devidePlus, [4,3,2], [2, 1, 5]) # 注意: 将(4, 2)作为一个组合进行计算,将(3, 1)作为一个组合进行计算 # In[20]: map(lambda x, y: x + y, [1, 3, 5, 7, 9], [2, 4, 6, 8, 10]) # In[22]: map(lambda x, y, z: x + y - z, [1, 3, 5, 7, 9], [2, 4, 6, 8, 10], [3, 3, 2, 2, 5]) # # if elif else # In[132]: j = 3 if j%2 == 1: print r'余数是1' elif j%2 ==2: print r'余数是2' else: print r'余数既不是1也不是2' # In[131]: x = 5 if x < 5: y = -1 z = 5 elif x > 5: y = 1 z = 11 else: y = 0 z = 10 print(x, y, z) # # while循环 # In[129]: j = 0 while j <10: print j j+=1 # avoid dead loop # In[128]: j = 0 while j <10: if j%2 != 0: print j**2 j+=1 # avoid dead loop # In[127]: j = 0 while j <50: if j == 30: break if j%2 != 0: print j**2 j+=1 # avoid dead loop # In[1]: a = 4 while a: print a a -= 1 if a < 0: a = None # [] # # try except # In[130]: for i in [2, 0, 5]: try: print devidePlus(4, i) except Exception, e: print e pass # In[1]: alist = [[1,1], [0, 0, 1]] for aa in alist: try: for a in aa: print 10 / a except Exception, e: print e pass # In[2]: alist = [[1,1], [0, 0, 1]] for aa in alist: for a in aa: try: print 10 / a except Exception, e: print e pass # # Write and Read data # In[225]: data =[[i, i**2, i**3] for i in range(10)] data # In[146]: for i in data: print '\t'.join(map(str, i)) # In[228]: type(data) # In[226]: len(data) # In[227]: data[0] # In[160]: # 保存数据 data =[[i, i**2, i**3] for i in range(10000)] f = open("/Users/chengjun/github/cjc2016/data/data_write_to_file.txt", "wb") for i in data: f.write('\t'.join(map(str,i)) + '\n') f.close() # In[162]: with open('/Users/chengjun/github/cjc2016/data/data_write_to_file.txt','r') as f: data = f.readlines() data[:5] # In[176]: with open('/Users/chengjun/github/cjc2016/data/data_write_to_file.txt','r') as f: data = f.readlines(1000) len(data) # In[177]: with open('/Users/chengjun/github/cjc2016/data/data_write_to_file.txt','r') as f: print f.readline() # In[6]: f = [1, 2, 3, 4, 5] for k, i in enumerate(f): print k, i # In[9]: with open('/Users/chengjun/github/cjc2016/data/data_write_to_file.txt','r') as f: for i in f: print i # In[181]: with open('/Users/chengjun/github/cjc2016/data/data_write_to_file.txt','r') as f: for k, i in enumerate(f): if k%2000 ==0: print i # In[17]: data = [] line = '0\t0\t0\n' line = line.replace('\n', '') line = line.split('\t') line = [int(i) for i in line] # convert str to int data.append(line) data # In[11]: # 读取数据 data = [] with open('/Users/chengjun/github/cjc2016/data/data_write_to_file.txt','r') as f: for line in f: #line = line.replace('\n', '').split('\t') #line = [int(i) for i in line] data.append(line) data # In[153]: # 读取数据 data = [] with open('/Users/chengjun/github/cjc2016/data/data_write_to_file.txt','r') as f: for line in f: line = line.replace('\n', '').split('\t') line = [int(i) for i in line] data.append(line) data # # 保存中间步骤产生的字典数据 # In[66]: import json data_dict = {'a':1, 'b':2, 'c':3} with open('/Users/chengjun/github/cjc2016/save_dict.json', 'w') as f: json.dump(data_dict, f) # In[67]: dd = json.load(open("/Users/chengjun/github/cjc2016/save_dict.json")) dd # ## 重新读入json # # 保存中间步骤产生的列表数据 # In[68]: data_list = range(10) with open('/Users/chengjun/github/cjc2016/save_list.json', 'w') as f: json.dump(data_list, f) # In[69]: dl = json.load(open("/Users/chengjun/github/cjc2016/save_list.json")) dl # # use dill to save data # In[85]: import dill # pip insstall dill # http://trac.mystic.cacr.caltech.edu/project/pathos/wiki/dill def myFunction(num): return num,num with open('/Users/chengjun/github/cjc2016/data.pkl', 'wb') as f: dill.dump(myFunction, f) # In[86]: with open('/Users/chengjun/github/cjc2016/data.pkl', 'r') as f: newFunction = dill.load(f)#, strictio=strictio)) newFunction('hello') # http://stackoverflow.com/questions/35603979/pickling-defaultdict-with-lambda # # 使用matplotlib绘图 # In[2]: get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt x = range(1, 100) y = [i**-3 for i in x] plt.plot(x, y, 'b-s') plt.ylabel('$p(k)$', fontsize = 20) plt.xlabel('$k$', fontsize = 20) plt.xscale('log') plt.yscale('log') plt.title('Degree Distribution') plt.show() # In[3]: import numpy as np # red dashes, blue squares and green triangles t = np.arange(0., 5., 0.2) plt.plot(t, t, 'r--') plt.plot(t, t**2, 'bs') plt.plot(t, t**3, 'g^') plt.show() # In[63]: # red dashes, blue squares and green triangles t = np.arange(0., 5., 0.2) plt.plot(t, t**2, 'b-s', label = '1') plt.plot(t, t**2.5, 'r-o', label = '2') plt.plot(t, t**3, 'g-^', label = '3') plt.annotate(r'$\alpha = 3$', xy=(3.5, 40), xytext=(2, 80), arrowprops=dict(facecolor='black', shrink=0.05), fontsize = 20) plt.ylabel('$f(t)$', fontsize = 20) plt.xlabel('$t$', fontsize = 20) plt.legend(loc=2,numpoints=1,fontsize=10) plt.show() # plt.savefig('/Users/chengjun/GitHub/cjc2016/figure/save_figure.png', # dpi = 300, bbox_inches="tight",transparent = True) # In[55]: plt.figure(1) plt.subplot(221) plt.plot(t, t, 'r--') plt.text(2, 0.8*np.max(t), r'$\alpha = 1$', fontsize = 20) plt.subplot(222) plt.plot(t, t**2, 'bs') plt.text(2, 0.8*np.max(t**2), r'$\alpha = 2$', fontsize = 20) plt.subplot(223) plt.plot(t, t**3, 'g^') plt.text(2, 0.8*np.max(t**3), r'$\alpha = 3$', fontsize = 20) plt.subplot(224) plt.plot(t, t**4, 'r-o') plt.text(2, 0.8*np.max(t**4), r'$\alpha = 4$', fontsize = 20) plt.show() # In[4]: def f(t): return np.exp(-t) * np.cos(2*np.pi*t) t1 = np.arange(0.0, 5.0, 0.1) t2 = np.arange(0.0, 5.0, 0.02) # In[5]: plt.figure(1) plt.subplot(211) plt.plot(t1, f(t1), 'bo') plt.plot(t2, f(t2), 'k') plt.subplot(212) plt.plot(t2, np.cos(2*np.pi*t2), 'r--') plt.show() # In[25]: import matplotlib.gridspec as gridspec t = np.arange(0., 5., 0.2) gs = gridspec.GridSpec(3, 3) ax1 = plt.subplot(gs[0, :]) plt.plot(t, t**2, 'b-s') ax2 = plt.subplot(gs[1,:-1]) plt.plot(t, t**2, 'g-s') ax3 = plt.subplot(gs[1:, -1]) plt.plot(t, t**2, 'r-o') ax4 = plt.subplot(gs[-1,0]) plt.plot(t, t**2, 'g-^') ax5 = plt.subplot(gs[-1,-2]) plt.plot(t, t**2, 'b-<') plt.tight_layout() # In[221]: def OLSRegressPlot(x,y,col,xlab,ylab): xx = sm.add_constant(x, prepend=True) res = sm.OLS(y,xx).fit() constant, beta = res.params r2 = res.rsquared lab = r'$\beta = %.2f, \,R^2 = %.2f$' %(beta,r2) plt.scatter(x,y,s=60,facecolors='none', edgecolors=col) plt.plot(x,constant + x*beta,"red",label=lab) plt.legend(loc = 'upper left',fontsize=16) plt.xlabel(xlab,fontsize=26) plt.ylabel(ylab,fontsize=26) # In[222]: x = np.random.randn(50) y = np.random.randn(50) + 3*x pearsonr(x, y) fig = plt.figure(figsize=(10, 4),facecolor='white') OLSRegressPlot(x,y,'RoyalBlue',r'$x$',r'$y$') plt.show() # In[206]: fig = plt.figure(figsize=(7, 4),facecolor='white') data = norm.rvs(10.0, 2.5, size=5000) mu, std = norm.fit(data) plt.hist(data, bins=25, normed=True, alpha=0.6, color='g') xmin, xmax = plt.xlim() x = np.linspace(xmin, xmax, 100) p = norm.pdf(x, mu, std) plt.plot(x, p, 'r', linewidth=2) title = r"$\mu = %.2f, \, \sigma = %.2f$" % (mu, std) plt.title(title,size=16) plt.show() # In[223]: from matplotlib.dates import WeekdayLocator, DayLocator, MONDAY, DateFormatter from matplotlib.finance import quotes_historical_yahoo_ochl, candlestick_ochl date1 = (2014, 2, 1) date2 = (2014, 5, 1) quotes = quotes_historical_yahoo_ochl('INTC', date1, date2) # In[224]: fig = plt.figure(figsize=(15, 5)) ax = fig.add_subplot(1,1,1) candlestick_ochl(ax, quotes, width=0.8, colorup='green', colordown='r', alpha=0.8) mondays = WeekdayLocator(MONDAY) # major ticks on the mondays alldays = DayLocator() # minor ticks on the days weekFormatter = DateFormatter('%b %d') # e.g., Jan 12 ax.xaxis.set_major_locator(mondays) ax.xaxis.set_minor_locator(alldays) ax.xaxis.set_major_formatter(weekFormatter) ax.autoscale_view() plt.setp( plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right') plt.title(r'$Intel \,Corporation \,Stock \,Price$',size=16) fig.subplots_adjust(bottom=0.2) plt.show() # # This is the end. # > Thank you for your attention.