# Quick refresher: map() with a conditional lambda
lst = [1, 2, 4, 5]
list(map(lambda x: 'lower' if x < 3 else 'higher', lst))
# %load_ext watermark
# %watermark -a "Sebastian Raschka" -u -d -v -p numpy,pandas,pyprind,matplotlib,nltk,sklearn
The Flask web application code is in the following directories:

1st_flask_app_1/: a simple Flask web application
1st_flask_app_2/: an extension of 1st_flask_app_1 with form validation and rendering
movieclassifier/: the movie review classifier embedded in a web application
movieclassifier_with_update/: same as movieclassifier, but uses a SQLite database for initialization

To run a web application locally, cd into the respective directory (listed above) and run the main application script:
cd ./1st_flask_app_1
python3 app.py
You should see output like the following in the terminal:
* Running on http://127.0.0.1:5000/
* Restarting with reloader
Open a web browser and go to the address printed in the terminal (usually http://127.0.0.1:5000/) to access the web application.
A live demo of the example application built in this tutorial is available at http://haesun.pythonanywhere.com/.
This section reuses the logistic regression model trained in the last section of Chapter 8. Run the following code blocks to train the model that the next sections will use.
Note
The following code uses the movie_data.csv dataset created in Chapter 8.
# # Decompress movie_data.csv.gz and save it as movie_data.csv
# import gzip
# with gzip.open('movie_data.csv.gz') as f_in, open('movie_data.csv', 'wb') as f_out:
#     f_out.writelines(f_in)
# import nltk
# nltk.download('stopwords')
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stop = stopwords.words('english')
porter = PorterStemmer()
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized
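As a quick sanity check, the tokenizer can be applied to a sample string: HTML markup is stripped, stop words are dropped, and emoticons are kept at the end with their "noses" removed:

# Quick check of the tokenizer
tokenizer('</a>This :) is :( a test :-)!')
# expected: ['test', ':)', ':(', ':)']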
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label
text_instance = stream_docs(path='data/movie_data.csv')
next(text_instance)
('"In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />""Murder in Greenwich"" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich family used their influence to cover the murder for more than twenty years. However, a snoopy detective and convicted perjurer in disgrace was able to disclose how the hideous crime was committed. The screenplay shows the investigation of Mark and the last days of Martha in parallel, but there is a lack of the emotion in the dramatization. My vote is seven.<br /><br />Title (Brazil): Not Available"', 1)
# Each call to next() yields the next review from the generator
next(text_instance)
('"OK... so... I really like Kris Kristofferson and his usual easy going delivery of lines in his movies. Age has helped him with his soft spoken low energy style and he will steal a scene effortlessly. But, Disappearance is his misstep. Holy Moly, this was a bad movie! <br /><br />I must give kudos to the cinematography and and the actors, including Kris, for trying their darndest to make sense from this goofy, confusing story! None of it made sense and Kris probably didn\'t understand it either and he was just going through the motions hoping someone would come up to him and tell him what it was all about! <br /><br />I don\'t care that everyone on this movie was doing out of love for the project, or some such nonsense... I\'ve seen low budget movies that had a plot for goodness sake! This had none, zilcho, nada, zippo, empty of reason... a complete waste of good talent, scenery and celluloid! <br /><br />I rented this piece of garbage for a buck, and I want my money back! I want my 2 hours back I invested on this Grade F waste of my time! Don\'t watch this movie, or waste 1 minute of your valuable time while passing through a room where it\'s playing or even open up the case that is holding the DVD! Believe me, you\'ll thank me for the advice!"', 0)
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
# Setting tol (e.g., 0.01) together with max_iter silences the convergence warning.
# Note: in scikit-learn >= 1.1, loss='log' was renamed to loss='log_loss'.
clf = SGDClassifier(loss='log', random_state=1, max_iter=1, tol=0.01)
doc_stream = stream_docs(path='data/movie_data.csv')
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()
0% [##############################] 100% | ETA: 00:00:00 Total time elapsed: 00:00:22
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))
Accuracy: 0.867
# Finally, use the last 5,000 test documents to update the model as well
clf = clf.partial_fit(X_test, y_test)
After training the logistic regression model, we save the classifier, the stop-word list, the Porter stemmer, and the HashingVectorizer to local disk as serialized objects, so the web application can load the trained classifier later.
import pickle, os
# dest = os.path.join('movieclassifier', 'pkl_objects')
# if not os.path.exists(dest):
# os.makedirs(dest)
pickle.dump(stop, open('data/stopwords.pkl', 'wb'), protocol=4)
pickle.dump(clf, open('data/classifier.pkl', 'wb'), protocol=4)
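As a quick check that the serialization round-trips, the pickled objects can be loaded back and used for a prediction (a minimal sketch; the paths match the pickle.dump calls above):

# Sketch: reload the pickled objects and verify they still work
import pickle
stop_reloaded = pickle.load(open('data/stopwords.pkl', 'rb'))
clf_reloaded = pickle.load(open('data/classifier.pkl', 'rb'))
print(clf_reloaded.predict(vect.transform(['I love this movie'])))  # e.g. [1]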
%%writefile movieclassifier/vectorizer.py
# Save the HashingVectorizer in a separate file so it can be imported later.
from sklearn.feature_extraction.text import HashingVectorizer
import re, os, pickle
cur_dir = os.path.dirname(__file__)
stop = pickle.load(open('data/stopwords.pkl', 'rb'))
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) \
        + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
First, change the current Python working directory to movieclassifier:
%reset
Once deleted, variables cannot be recovered. Proceed (y/[n])?
# import os
# os.chdir('movieclassifier')
import pickle, os
import numpy as np
# After %reset, re-import the vectorizer and reload the pickled classifier
# (the path assumes the repository's movieclassifier/pkl_objects layout)
from vectorizer import vect
clf = pickle.load(open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb'))
label = {0: 'negative', 1: 'positive'}
example = ['I love this movie']
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%' %
      (label[clf.predict(X)[0]],
       np.max(clf.predict_proba(X))*100))
Before running the following code, make sure the current working directory is movieclassifier.
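A small guard like the following (illustrative only) can catch a wrong working directory early:

import os
assert os.path.basename(os.getcwd()) == 'movieclassifier', os.getcwd()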
import sqlite3
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
c.execute('DROP TABLE IF EXISTS review_db')
c.execute('CREATE TABLE review_db (review TEXT, sentiment INTEGER, date TEXT)')
example1 = 'I love this movie'
c.execute("INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))", (example1, 1))
example2 = 'I disliked this movie'
c.execute("INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))", (example2, 0))
conn.commit()
conn.close()
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
c.execute("SELECT * FROM review_db WHERE date BETWEEN '2017-01-01 10:10:10' AND DATETIME('now')")
results = c.fetchall()
conn.close()
print(results)
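In the web application itself, inserts like the ones above are wrapped in a small helper. A minimal sketch (modeled on the sqlite_entry function in the repository's app.py; treat the exact name as illustrative):

import sqlite3

def sqlite_entry(path, document, y):
    # Append one review and its sentiment label, timestamped by SQLite
    conn = sqlite3.connect(path)
    c = conn.cursor()
    c.execute("INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))",
              (document, y))
    conn.commit()
    conn.close()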
Use the movieclassifier_with_update directory from the downloaded GitHub repository (otherwise, make a copy of the movieclassifier directory and use that).
import os, shutil
os.chdir('../movieclassifier_with_update')
shutil.copyfile('../movieclassifier/pkl_objects/classifier.pkl',
                './pkl_objects/classifier.pkl')
# Define a function that updates the classifier with the data stored in the SQLite database:
import pickle, sqlite3, os
import numpy as np
# Import the HashingVectorizer from the local directory
from vectorizer import vect
def update_model(db_path, model, batch_size=10000):
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute('SELECT * from review_db')
    results = c.fetchmany(batch_size)
    while results:
        data = np.array(results)
        X = data[:, 0]
        y = data[:, 1].astype(int)
        classes = np.array([0, 1])
        X_train = vect.transform(X)
        model.partial_fit(X_train, y, classes=classes)  # was clf.partial_fit: update the passed-in model
        results = c.fetchmany(batch_size)
    conn.close()
    return None
# Update the model:
cur_dir = '.'
# If you put this code into the app.py file, use the following path instead:
# import os
# cur_dir = os.path.dirname(__file__)
clf = pickle.load(open(os.path.join(cur_dir,
                                    'pkl_objects',
                                    'classifier.pkl'), 'rb'))
db = os.path.join(cur_dir, 'reviews.sqlite')
update_model(db_path=db, model=clf, batch_size=10000)
# To update the classifier.pkl file, uncomment the following lines:
# pickle.dump(clf, open(os.path.join(cur_dir,
#                                    'pkl_objects', 'classifier.pkl'), 'wb'),
#             protocol=4)
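In the deployed app, the same refresh can run once at startup so every restart begins from the latest collected reviews. A minimal sketch (assuming the movieclassifier_with_update layout; the startup wiring below is illustrative, not the repository's exact app.py):

# Sketch: refresh the model from the database when the Flask app boots,
# then persist the updated classifier for the next restart.
if __name__ == '__main__':
    update_model(db_path=db, model=clf, batch_size=10000)
    pickle.dump(clf, open(os.path.join(cur_dir, 'pkl_objects', 'classifier.pkl'), 'wb'),
                protocol=4)
    # app.run()  # start the Flask server afterwards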