#!/usr/bin/env python
# coding: utf-8

# # Table of Contents

# 1.3 02. 「パトカー」＋「タクシー」＝「パタトクカシーー」
#
# 1.4 03. 円周率
#
# 1.5 04. 元素記号
#
# 1.6 05. n-gram
#
# 1.7 06. 集合
#
# 1.8 07. テンプレートによる文生成
#
# 1.9 08. 暗号文
#
# 1.10 09. Typoglycemia
#
# 1.11 参考リンク

# # Chapter 1: Warm-up
# http://www.cl.ecei.tohoku.ac.jp/nlp100/#ch1

# ## 00. Reversed string
# Obtain the string made of the characters of "stressed" arranged in
# reverse order (from the last character to the first).

# In[1]:

s = 'stressed'
rev = s[::-1]  # a slice with step -1 walks the string backwards
print(rev)


# ## 01. "パタトクカシーー"
# Take the 1st, 3rd, 5th and 7th characters of "パタトクカシーー" and
# concatenate them.

# In[2]:

# Explicit version: 1-based positions from the task converted to 0-based.
s = 'パタトクカシーー'
s1357 = s[1-1] + s[3-1] + s[5-1] + s[7-1]
print(s1357)


# In[3]:

# Same result with a stride-2 slice.
s = 'パタトクカシーー'
s1357 = s[::2]
print(s1357)


# ## 02. "パトカー" + "タクシー" = "パタトクカシーー"
# Interleave the characters of "パトカー" and "タクシー" from the start to
# obtain "パタトクカシーー".

# In[4]:

s1 = 'パトカー'
s2 = 'タクシー'
s = ''.join([p + t for p, t in zip(s1, s2)])
print(s)


# ## 03. Pi
# Split the sentence below into words and build the list of the number of
# (alphabetic) characters of each word, in order of appearance.

# In[5]:

import re

sentense = 'Now I need a drink, alcoholic of course, after the heavy lectures involving quantum mechanics.'
# Inside a character class "|" is a literal pipe, not alternation, so the
# class lists only the punctuation that must be dropped.
words = re.sub(r'[.,]', '', sentense).split()
counts = [len(w) for w in words]
print(counts)


# ## 04. Element symbols
# Split the sentence below into words. Take the first character of words
# number 1, 5, 6, 7, 8, 9, 15, 16 and 19 and the first two characters of
# every other word, then build a mapping from the extracted string to the
# position of the word (1-based).

# In[6]:

sentense = "Hi He Lied Because Boron Could Not Oxidize Fluorine. New Nations Might Also Sign Peace Security Clause. Arthur King Can."
words = re.sub(r'[.,]', '', sentense).split()
idx_list_first = [1, 5, 6, 7, 8, 9, 15, 16, 19]
# Reference: http://qiita.com/tanaka0325/items/08831b96b684d7ecb2f7
# Words at the listed positions contribute one character, all others two.
dic = {w[:1 if i in idx_list_first else 2]: i for i, w in enumerate(words, 1)}
dic  # bare expression: shown as the cell result when run in a notebook


# ## 05.
# n-gram
# Write a function that builds n-grams from a given sequence (string, list,
# ...). Use it to obtain the word bi-grams and the character bi-grams of the
# sentence "I am an NLPer".

# In[7]:

def n_gram(_in, n):
    """Return every contiguous length-``n`` slice of ``_in``, in order.

    Args:
        _in: Any sliceable sequence with a length (str, list, tuple, ...).
        n: Size of each n-gram; must be at least 1.

    Returns:
        A list of slices of ``_in``; empty when ``_in`` is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer')
    # Only start positions that leave room for a full n-gram are visited,
    # so no short tail slices have to be filtered out afterwards.
    return [_in[i:i + n] for i in range(len(_in) - n + 1)]


s = "I am an NLPer"
# Word bi-grams
n_gram(s.split(), 2)


# In[8]:

# Character bi-grams
n_gram(s, 2)


# In[9]:

print('mono-gram(word)', n_gram(s.split(), 1))
print('mono-gram(str)', n_gram(s, 1))


# In[10]:

print('tri-gram(word)', n_gram(s.split(), 3))
print('tri-gram(str)', n_gram(s, 3))


# ## 06. Sets
# Let X and Y be the sets of character bi-grams contained in
# "paraparaparadise" and "paragraph" respectively. Compute the union,
# intersection and difference of X and Y, and check whether the bi-gram
# 'se' is contained in X and in Y.

# In[11]:

s1 = "paraparaparadise"
s2 = "paragraph"
X = set(n_gram(s1, 2))
print(X)
Y = set(n_gram(s2, 2))
print(Y)


# In[12]:

# Union
union = X.union(Y)
print(union)


# In[13]:

# Intersection
intersec = X.intersection(Y)
print(intersec)


# In[14]:

# Differences (both directions)
diff_X_Y = X.difference(Y)
print(diff_X_Y)
diff_Y_X = Y.difference(X)
print(diff_Y_X)


# In[15]:

'se' in X


# In[16]:

'se' in Y


# ## 07. Sentence generation from a template
# Implement a function that takes arguments x, y, z and returns the string
# "x時のyはz". Check the result with x=12, y="気温", z=22.4.

# In[17]:

def create_sentense_temp(x, y, z):
    """Return the template sentence 'x時のyはz' filled with the arguments."""
    return '{}時の{}は{}'.format(x, y, z)


create_sentense_temp(12, '気温', 22.4)


# ## 08. Cipher text
# Implement a function cipher that converts each character of the given
# string as follows:
#   - a lowercase English letter is replaced by the character whose code is
#     (219 - character code)
#   - any other character is output unchanged
# Use the function to encrypt and decrypt an English message.

# In[18]:

# str.islower() is True here even though 'あ' is not an ASCII letter, which
# is why cipher() compares against the 'a'..'z' range instead.
s = 'aあ'
s.islower()


# In[19]:

def cipher(s: str) -> str:
    """Mirror every ASCII lowercase letter of ``s`` (a<->z, b<->y, ...).

    ord('a') + ord('z') == 219, so chr(219 - ord(c)) maps a letter to its
    mirror image in the alphabet. Applying the function twice returns the
    original string. All other characters are left untouched.
    """
    return ''.join(chr(219 - ord(c)) if 'a' <= c <= 'z' else c for c in s)


cipher('Hello World')


# In[20]:

cipher('abcdefghijklmnopqrstuvwxyz')


# In[21]:

# Encrypting twice decrypts the message.
cipher(cipher('Hello World'))


# ## 09.
# Typoglycemia
# For a sequence of words separated by spaces, write a program that keeps
# the first and last character of every word and randomly reorders the
# remaining characters. Words of length 4 or less are not reordered. Feed it
# a suitable English sentence (for example "I couldn't believe that I could
# actually understand what I was reading : the phenomenal power of the human
# mind .") and check the result.

# In[22]:

import re
import random

s = "I couldn't believe that I could actually understand what I was reading : the phenomenal power of the human mind ."


def _shuffle_middle(word):
    """Return ``word`` with its inner characters in random order.

    The first and last characters stay in place. Words shorter than four
    characters have at most one inner character, so they are returned
    unchanged; this also keeps a one-character word from being doubled by
    ``word[0] + word[-1]``.
    """
    if len(word) < 4:
        return word
    middle = list(word[1:-1])
    # random.shuffle works in place and returns None, so it cannot be used
    # inside an expression; shuffle a copy and join it back afterwards.
    random.shuffle(middle)
    return word[0] + ''.join(middle) + word[-1]


def typoglycemia(not_sort_word_length=4):
    """Build a function that scrambles the inside of long words.

    Args:
        not_sort_word_length: Words with at most this many characters are
            returned unchanged.

    Returns:
        A function ``typo(s)`` that removes '.', ',' and ':' from ``s``,
        splits it on whitespace and returns the list of words, with the
        inner characters of every sufficiently long word shuffled.
    """
    def typo(s):
        # "|" inside a character class is a literal pipe, so only the
        # punctuation characters themselves are listed.
        words = re.sub(r'[.,:]', '', s).split()
        return [
            _shuffle_middle(w) if len(w) > not_sort_word_length else w
            for w in words
        ]
    return typo


typo = typoglycemia(not_sort_word_length=4)
t = typo(s)
' '.join(t)


# In[23]:

# random.shuffle has no return value: it reorders the list in place.
s = 'abcdefg'
l = list(s)
random.shuffle(l)
l


# ## Reference links
# - [NLP 100 Exercises with Python (Chapter 1)](http://qiita.com/gamma1129/items/37bf660cf4e4b21d4267)
# - [NLP 100 Exercises Chapter 1 in Python](http://qiita.com/piyo56/items/eb72b496669f541055c3)