%%time
!wget -c https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/tmx/en-zh_tw.tmx.gz
%%time
!wget -c http://nlp.csie.org/~rubentsui/files/TranslationTech1082/Regex/nytimes.txt.gz
%%!
gunzip en-zh_tw.tmx.gz
gunzip nytimes.txt.gz
%%time
phrase = '發酒瘋'; corpus = 'en-zh_tw.tmx'
!fgrep -B1 "$phrase" $corpus --color=always | sed '/--/d'
%%time
regex = 'performance[- ]enhanc\w+ \w+'
corpus = 'nytimes.txt'
print("Phrases matched and their frequencies:")
!egrep -i -o "$regex" $corpus | sort | uniq -c | sort -nr
print("\nConcordancer output:")
!egrep -i -A1 "$regex" $corpus --color=always