#!/usr/bin/env python
# coding: utf-8

# # find love with NLTK <3
#
# - mail: tamkien@cri-paris.org
# - twitter: @wekeypedia
# - website: http://github.com/wekeypedia

# While looking at the diffs of the Wikipedia page [Love](http://en.wikipedia.org/w/Love), I was surprised by the number and the specificity of the "vandalisms" on it. They make this page look more like a high-school desk full of markings or a carved tree in the park. I find it more funny than damaging to the encyclopedia: it is a sign of the exteriority of this object of knowledge, far from being a pure ideal space of wisdom. Luckily, a lot of bots and users are working hard to delete those signs of love and make them invisible. I just wanted to see the proportions of the phenomenon and also explore this other side of collective intelligence. In my mind, using [nltk](http://nltk.org) would make it very easy, lower the supervision to a minimum, and also be a good exercise to start using that library.
#
# First, we are going to load the [diffs from a cache](https://github.com/WeKeyPedia/notebooks/tree/master/wisdom/data/Love) and use the [wekeypedia python library](http://github.com/wekeypedia/toolkit-python) to skip the data acquisition and parsing part and jump straight into the fun of basic usage of the nltk part-of-speech tagger.

# ## loading the datasets

# In[1]:

get_ipython().run_line_magic('run', '"libraries.ipynb"')


# In[2]:

def from_file(name):
    with codecs.open(name, "r", encoding="utf-8-sig") as f:
        data = json.load(f)

    return data

def list_revisions(page):
    return os.listdir("data/%s" % (page))

def load_revisions(s):
    revisions = defaultdict(dict)

    p = wekeypedia.WikipediaPage(s)

    # cached diffs are stored as data/<page>/<revid>.json
    revisions_list = list_revisions(s)
    revisions_list = map(lambda revid: revid.split(".")[0], revisions_list)

    revisions = { revid: from_file("data/%s/%s.json" % (s, revid)) for revid in revisions_list }

    return revisions

revisions = load_revisions("Love")

print "revisions: %s" % len(revisions)


# In[3]:

page = wekeypedia.WikipediaPage("Love")


# ## detect love
#
# The first thing that needs to be done is to tokenize the added sentences with `nltk.word_tokenize` and tag them with `nltk.pos_tag`, which uses the Penn Treebank tagset.
#
# We are going to use the tagging very basically, by looking at sentences that include at least two proper nouns (`NNP`) and at least 3 words. This keeps every variation of "x loves y", "x + y = love", or "love is about x and y". The `pos_tag` function gives back basic results; for a more accurate analysis, it would be more useful to use `nltk.ne_chunk()` and look for 2 x `PERSON` + 1 x `VERB`.
#
# ```python
# def i_love_u(pos_tags):
#     return len([ t for t in pos_tags if t[1] == "NNP" and not("love" in t[0].lower()) ]) >= 2
# ```
#
# We then make sure the sentence has at least 3 words but is also not too long: it is the usual edit&run signature. Some editors have produced more elaborate declarations of love, but we will check those another time with other strategies.
#
# ```python
# def correct_size(pos_tags):
#     return 2 < len(pos_tags) < 20
# ```
#
# We also make sure the addition contains the word "love" at least once, whatever its inflection or position.
#
# ```python
# def contains_love(sentence):
#     return "love" in sentence.lower()
# ```
#
# We then compose all those conditions into one big chain of `and`.
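#
# To get a feel for what the tagger returns, here is a quick illustrative sketch (the sentence is made up and the snippet is not part of the pipeline) of the kind of output `pos_tag` produces on a typical edit&run addition:
#
# ```python
# import nltk
#
# sentence = "Alice loves Bob forever"
# tags = nltk.pos_tag(nltk.word_tokenize(sentence))
# print tags
# # typically something like:
# # [('Alice', 'NNP'), ('loves', 'VBZ'), ('Bob', 'NNP'), ('forever', 'RB')]
# ```
#
# The two `NNP` tokens are what `i_love_u` counts, while `contains_love` checks the raw sentence, so the filter catches "love" whether it shows up as a verb, a noun, or part of a longer declaration.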
# In[4]:

def is_it_love(sentence):
    result = False

    pos_tags = nltk.pos_tag(nltk.word_tokenize(sentence))

    def i_love_u(pos_tags):
        return len([ t for t in pos_tags if t[1] == "NNP" and not("love" in t[0].lower()) ]) >= 2

    def correct_size(pos_tags):
        return 2 < len(pos_tags) < 20

    def contains_love(sentence):
        return "love" in sentence.lower()

    result = i_love_u(pos_tags) and correct_size(pos_tags) and contains_love(sentence)

    return result

def find_love_line(source, sentence):
    line = -1

    d = BeautifulSoup(source, 'html.parser')

    # find the diff row that contains the added sentence
    tr = [ tr for tr in d.find_all("tr") if sentence in tr.get_text() ]

    # walk back to the closest "Line n:" header of the diff table
    for previous in tr[0].find_previous_siblings():
        if type(previous) == type(tr[0]) and len(previous.find_all("td", "diff-lineno")) > 0:
            line = previous.find("td").get_text()
            break

    # no line header found before the sentence
    if line == -1:
        return -1

    # the cell text looks like "Line 42:", so keep only the number
    line = line.split(" ")[1]
    line = line[0:-1]

    return int(line)

def detect_love(revid):
    result = { "revid": revid, "love": [], "plusminus": {}, "lines": [] }

    diff_html = revisions[revid]["diff"]["*"]
    diff = page.extract_plusminus(diff_html)

    result["plusminus"] = diff

    rev_index = revisions.keys()
    print "\rrevision: %s/%s" % ( rev_index.index(revid), len(rev_index) ),

    # result["love"] = [ sentence for sentence in diff["added"] if is_it_love(sentence) ]

    pos = 0

    for sentence in diff["added"]:
        if is_it_love(sentence):
            result["love"].append(sentence)
            result["lines"].append(find_love_line(diff_html, sentence))
            pos += 1
            print " ♥︎",

    return result

# revlist = random.sample(revisions.keys(), 100)
revlist = revisions.keys()

results = [ detect_love(revid) for revid in revlist ]
# results = [ detect_love(revid) for revid in [ "98452213" ] ]

print "\r ",

love = [ s for s in results if len(s["love"]) > 0 ]

print "♥︎" * len(love)
print len(love)
# print love


# ### alternative strategies
#
# - look for reverts done by bots
# - find other tree structures
# - use reverse IP geolocation to find where that kind of good stuff happened

# ## cleaning and saving the result for manual checking

# In[5]:

final_result = []

# keep only additions that do not look like leftover wiki markup
for l in love:
    for s in l["love"]:
        if not("[" in s) and not("*" in s) and not("==" in s):
            final_result.append([ l["revid"], s ])

final_result = pd.DataFrame(final_result)
final_result.columns = ["revision id", "sentence"]
final_result.head()

print len(final_result)

final_result.to_csv("data/find_love.csv", encoding="utf-8")


# The csv can be found on [github](https://github.com/WeKeyPedia/notebooks/blob/master/wisdom/data/find_love.csv) and [google docs](https://docs.google.com/spreadsheets/d/1kktXDS7FZOSu2eoKmsd35PVuUABIQw-q0qNl68lf-_0/edit?usp=sharing)
#
# ### manual cleaning of the dataset
#
# We then check our results manually and find ~70 false positives, without knowing the number of false negatives. Still, this is a solid base to start a semi-supervised machine learning classifier that could find fancier grammatical structures.
#
# You can find the [final cleaned csv](https://github.com/WeKeyPedia/notebooks/blob/master/wisdom/data/find_love.csv) on github too.

# In[6]:

# reload the manually checked csv and drop the rows flagged as false positives
final_result = pd.DataFrame.from_csv("data/find_love-checked.csv", encoding="utf-8")
final_result = final_result.drop(final_result[final_result["false positive"] == 1].index)

print len(final_result)


# ## insert the love back into the page
#
# In order to write all the marks of affection back into the wikipedia page, we first need to extract where each of them was added. The following implementation is not entirely accurate since it retrieves the line number in the previous version of the page and not the current one. Since it is only for play purposes, we will not be bothered by that detail.
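#
# Once the line numbers are retrieved (see below), the write-back itself can be sketched roughly as follows. This is only a minimal illustration under assumptions, not the notebook's actual implementation: `page_lines` (the current page text split into lines) and the `(line, sentence)` pairs collected by `detect_love` are assumed to be already available.
#
# ```python
# def put_the_love_back(page_lines, declarations):
#     # declarations: list of (line number, sentence) pairs, e.g. zip(result["lines"], result["love"])
#     lines = list(page_lines)
#     # insert from the bottom up so earlier insertions do not shift later line numbers
#     for line, sentence in sorted(declarations, reverse=True):
#         lines.insert(min(max(line, 0), len(lines)), sentence)
#     return lines
# ```
#
# Feeding it the `lines` and `love` fields gathered by `detect_love` would give back a page text with the declarations restored at (approximately) their original positions.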
#
# Retrieving the line number is relatively easy; we just need to extract `