#!/usr/bin/env python
# coding: utf-8
# # Modules and packages importing
#
# The main modules are the following with their utility:
#
# - ```nltk``` the best known and most used module in the natural language processing community.
# - ```bs4``` or ```BeautifulSoup``` parses the HTML code of a web page so that specific tags or elements can be accessed more easily.
# - ```requests``` for fetching the web pages behind the links.
# - ```re``` is useful for doing textual processing such as searching for patterns or correcting errors in the extracted texts.
# - ```numpy``` to manipulate vectors and matrices and to perform mathematical operations on them, such as linear algebra.
# - ```matplotlib``` for graphic illustrations.
# - ```datetime``` and ```timeit``` to measure how long the extraction takes with the different methods, in order to speed up the program.
# - the magic command ```%matplotlib notebook``` makes the plots interactive so that their size and format can be adjusted manually.
# In[1]:
import datetime
from timeit import default_timer as timer
import matplotlib
import matplotlib.pyplot as plt
import requests
import re
from bs4 import BeautifulSoup
import numpy as np
import nltk
import sys
get_ipython().run_line_magic('matplotlib', 'notebook')
# ## Additional tools to make the displayed results look pretty
# ```join_func``` will allow us to rebuild a sentence (in one piece) from the ordered list of its words; it does the opposite of what word tokenization does. The ```color``` class allows you to choose the font style of the printed elements.
# In[2]:
def join_func(sentence):
    sentence = ' '.join(sentence)                       # join the tokens with spaces
    sentence = re.sub(r" ([,.;\):])", r"\1", sentence)  # stick punctuation to the word on its left
    sentence = re.sub(r"([\(]) ", r"\1", sentence)      # stick opening brackets to the word on their right
    sentence = re.sub(r" ([']) ", r"\1", sentence)      # join the apostrophe on both sides
    return sentence
class color:
PURPLE = '\033[95m'
BLACK = '\033[1;90m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
BLUE = '\033[94m'
GREEN = '\033[1;92m'
YELLOW = '\033[93m'
RED = '\033[1;91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'
BBCKGRND = '\033[0;100m'
RBCKGRND = '\033[0;101m'
print(color.BLUE + color.BOLD + 'Hello World !' + color.END )
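# A quick illustrative check of ```join_func``` (not in the original notebook): it should roughly undo ```nltk.word_tokenize``` on a simple sentence. It assumes the NLTK "punkt" tokenizer data is available (e.g. via ```nltk.download('punkt')```).
# In[ ]:
demo_sentence = "Hello, world (again); this is a test."
demo_tokens = nltk.word_tokenize(demo_sentence)   # split the sentence into words
print("Tokens   :", demo_tokens)
print("Rejoined :", join_func(demo_tokens))       # glue the tokens back together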
# # Download the file containing all the titles
#
# We download the compressed file $A^1$. After decompressing it, we get file $B^2$, which contains the titles of all Wikipedia articles in a given (desired) language. In our case it will be "*simplewiki*", which contains the Wikipedia articles that have been simplified. Since there are many more articles in standard English "**enwiki**" (which we consider as complex) than simplified ones "**simplewiki**", we only extract the simple titles in order to reduce the number of texts that would have only one of the two versions. Remember that the goal is to extract, for each article, both the simple and the complex version (i.e. **simplewiki** and **enwiki**). The files can be found at the Wikimedia dumps link$^{\ast}$ below; a small download sketch follows the footnotes.
#
# $^1$ simplewiki-20230123-all-titles-in-ns-0(1).gz $\newline$
# $^2$ simplewiki-20230123-all-titles-in-ns-0(1) $\newline$
# $^{\ast}$ https://dumps.wikimedia.org/other/pagetitles/20230123/
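# A minimal sketch, assuming the archive name below (built from the footnotes and the dumps URL above) is still available, of how file A could be downloaded and decompressed into file B directly from Python:
# In[ ]:
import gzip, shutil
archive_url = "https://dumps.wikimedia.org/other/pagetitles/20230123/simplewiki-20230123-all-titles-in-ns-0.gz"  # assumed file name on the dumps server
archive = "simplewiki-20230123-all-titles-in-ns-0.gz"
with open(archive, "wb") as f:
    f.write(requests.get(archive_url).content)    # download compressed file A
with gzip.open(archive, "rb") as f_in, open("simplewiki-20230123-all-titles-in-ns-0(1)", "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)               # decompress into file B (the "(1)" suffix matches the name used below)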
# ## Title file reading and pre-processing
# Some titles contain a lot of extra spaces. After deleting these extra spaces, as well as the first and last titles to lighten the file, we obtain a list that contains the **simplewiki** titles to be extracted from the web.
# In[3]:
file = "simplewiki-20230123-all-titles-in-ns-0(1)"
#file = "simplewiki-20200420-all-titles"
titl = open(file,"r").readlines()
title = [w[w.find("\t")+1:] for w in titl[4:-5000]]
titles = np.array([str(re.sub('\"',"",w[:-1])) for w in title])
print(titles[:5])
# In[4]:
print(str("The number of titles is: " + color.BOLD + "{}" + color.END + ".").format(len(titles)))
# ### List of links corresponding to each title:
# We transform the list of titles into a list of links, each redirecting to the web page of the article that corresponds to the title.
# ### Simple urls
# In[5]:
def simple_wiki(t):
return "https://simple.wikipedia.org/wiki/" + str(t)
simple_wiki = np.vectorize(simple_wiki)
# s_url are links to the Wikipedia pages written in Simple English
s_url = np.array(list(map(simple_wiki,titles[:100])))
print(str("Number of effectively extracted simple urls is " + color.BOLD + "{}" + color.END + ".").format(len(s_url)),
"\nFirst three simple urls: ", *s_url[:3], sep="\n")
# ### Complex urls
# The only difference is the keyword "en" instead of "simple" at the beginning of each url
# In[6]:
def complex_wiki(t):
return "https://en.wikipedia.org/wiki/" + str(t)
complex_wiki = np.vectorize(complex_wiki)
# c_url are links to the Wikipedia pages written in complex (standard) English
c_url = np.array(list(map(complex_wiki,titles[:100])))
print(str("Number of effectively extracted complex urls is " + color.BOLD + "{}" + color.END + ".").format(len(c_url)),
"\nFirst three complex urls: ", *c_url[:3], sep="\n")
# ## The extraction of the corresponding web page content to each title
# We define functions that receive a link and return the content of the corresponding page (i.e. of the corresponding title): the ```srqst()``` function for simple urls and ```crqst()``` for complex ones.
# ### ```srqst()```: Requesting simple texts
# In[7]:
def srqst(t):
lst = str()
data = requests.get(t) # getting the raw web page content
soup = BeautifulSoup(data.text,"html.parser") # parsing the content
data2 = soup.find_all("p") # accessing the desired content ("p" as paragraph)
if len(data2) >= 5 :
for i in data2[:4] :
if "Pages for" not in i.text : # This line allow to delete empty extractions
lst += i.text
else :
for i in data2 :
if "Pages for" not in i.text :
lst += i.text
return lst
vs = np.vectorize(srqst)
ls = np.array(list(map(vs, s_url[:10])))
for i,j in enumerate(ls[:2]):
print(str(color.BOLD + "Complex text number ({}):" + color.END).format(i+1), j)
# ### ```crqst()```: Requesting complex texts
# In[8]:
def crqst(t):
lst = str()
data = requests.get(t)
soup = BeautifulSoup(data.text,"html.parser")
data2 = soup.find_all("p")
try :
if len(data2) >= 4 :
for i in data2[:4] :
if "Pages for" not in i.text :
lst += i.text
else :
for i in data2 :
if "Pages for" not in i.text :
lst += i.text
    except Exception :
        pass    # continue even if this page fails to download or parse
return lst
vc = np.vectorize(crqst)
lc = np.array(list(map(vc, c_url[:10])))
for i,j in enumerate(lc[:2]):
print(str(color.BOLD + "Complex text number ({}):" + color.END).format(i+1), j)
# **Remark**
#
# Note that we could have used a ``try:... except...`` block to make sure that the program keeps running even when there are extraction problems, for instance when a page no longer exists or when the content is not in the expected location. A minimal sketch of such a guard follows.
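# The helper name ```safe_rqst``` below is ours (not part of the notebook); it simply wraps one of the extraction functions so that a single failing page does not stop the whole extraction.
# In[ ]:
def safe_rqst(url, rqst_func):
    # rqst_func is one of the extraction functions above (srqst or crqst);
    # on any network or parsing error we return an empty string and keep going.
    try :
        return rqst_func(url)
    except Exception as exc :
        print("Skipping", url, "->", exc)
        return ""
print(safe_rqst(s_url[0], srqst)[:80])    # first characters of the first simple text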
# As we can see, there can be errors in the extracted texts, such as the numbers in square brackets indicating the different definitions, or different texts for the same title. There are also punctuation problems: a comma (or period) not followed by a space makes the word tokenizer believe that the two words surrounding it form a single word, so it will not split them. See the example below:
# ### Cleaning data
# In[9]:
example_sequence = "firstword secondword.thirdword 3.14"
print("Raw example: ", example_sequence)
print(color.RED + color.BOLD+ "Word tokenized raw example: " + color.END, nltk.word_tokenize(example_sequence),end="\n\n")
corrected_example_sequence = re.sub(r"(\D)\.(\D)",r"\1. \2", example_sequence)
print("The corrected example: ", corrected_example_sequence)
print(color.GREEN + color.BOLD+"Word tokenized corrected example: " + color.END, nltk.word_tokenize(corrected_example_sequence))
# Notice that we want to put a space after a dot separating two non-digit words, but we want to keep it as it is for numbers. Here is another example with extra spaces and a comma problem:
# In[10]:
example2 = "Containing some errors like the following brackets[1], [2][3] or the coma,between two words 3,5.. "
deleting_brackets = re.sub(r"\[.*?\]","", example2)
correcting_comma = re.sub(r"(\D),(\D)", r"\1, \2", deleting_brackets)
correcting_spaces = re.sub(r"\s+", " ", correcting_comma)
print(correcting_spaces)
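# As a hedged convenience (the helper name ```clean_text``` is ours, not from the notebook), the cleaning steps above can be chained into a single function that could be applied to the extracted texts:
# In[ ]:
def clean_text(text):
    text = re.sub(r"\[.*?\]", "", text)             # remove bracketed reference numbers
    text = re.sub(r"(\D)\.(\D)", r"\1. \2", text)   # space after a dot between non-digits
    text = re.sub(r"(\D),(\D)", r"\1, \2", text)    # space after a comma between non-digits
    text = re.sub(r"\s+", " ", text)                # collapse repeated whitespace
    return text.strip()
print(clean_text(example2))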
# # Starting extraction
# Let $n$ be the number of texts we want to extract. We use ```np.random.randint``` to randomly select $n$ titles from the title list.
# In[11]:
n = 100
indx = np.random.randint(0, len(titles), size=n, dtype='int')
selected_titles = titles[indx]
s_url = np.array(list(map(simple_wiki, selected_titles)))
c_url = np.array(list(map(complex_wiki,selected_titles)))
#print(s_url[:2], "\n", c_url[:10])
print("First two simple urls: ", *s_url[:2], "\nFirst two complex urls: ", *c_url[:2], sep="\n")
# ## Getting balanced couples for each text
# ### First filtering
# We can notice that most texts have different sizes depending on the language they are extracted from. In order to obtain similar sizes between the two versions **simplewiki** and **enwiki**, we filter the extracted texts to keep only those whose size lies between $100$ and $1400$ characters in both versions. In the example below, we only apply it to $15$ texts to get the results quickly.
#
# We also wrap the functions with ```np.vectorize``` so that they can be applied element-wise together with the Python built-in ```map()```; note that these are conveniences for applying a function over a sequence rather than true parallelism.
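# Below is a tiny illustrative check of the size filter on toy strings (not part of the extraction itself): a couple is kept only when both of its versions are between 100 and 1400 characters long.
# In[ ]:
toy_ls = np.array(["a" * 50, "b" * 500, "c" * 2000])   # toy "simple" texts
toy_lc = np.array(["x" * 300, "y" * 700, "z" * 900])   # toy "complex" texts
keep = np.array([100 <= len(t) <= 1400 for t in toy_ls]) & np.array([100 <= len(t) <= 1400 for t in toy_lc])
print("Kept couples:", np.where(keep)[0])               # only the second couple survives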
# In[12]:
def srqst(t):
lst = str()
data = requests.get(t)
soup = BeautifulSoup(data.text,"html.parser")
data2 = soup.find_all("p")
if len(data2) >= 5 :
for i in data2[:5] :
if "Pages for" not in i.text :
lst += i.text
else :
for i in data2 :
if "Pages for" not in i.text :
lst += i.text
return lst
vs = np.vectorize(srqst)
ls = np.array(list(map(vs, s_url[:15])))
def crqst(t):
lst = str()
data = requests.get(t)
soup = BeautifulSoup(data.text,"html.parser")
data2 = soup.find_all("p")
try :
if len(data2) >= 3 :
for i in data2[:3] :
if "Pages for" not in i.text :
lst += i.text
else :
for i in data2 :
if "Pages for" not in i.text :
lst += i.text
    except Exception :
        pass    # continue even if this page fails to download or parse
return lst
vc = np.vectorize(crqst)
lc = np.array(list(map(vc, c_url[:15])))
idx1 = np.array(list(map(lambda x : 100 <= len(x) <= 1400, ls)))
idx2 = np.array(list(map(lambda x : 100 <= len(x) <= 1400, lc)))
# keep only the couples that satisfy the size condition in both versions
ls = ls[idx1 * idx2]
lc = lc[idx1 * idx2]
# ## First Method using ```for``` loop
# In[ ]:
start = timer()
def srqst(t):
    lst = str()
    data = requests.get(t)
    soup = BeautifulSoup(data.text,"html.parser")
    data2 = soup.find_all("p")
    if len(data2) >= 5 :
for i in data2[:4] :
if "Pages for" not in i.text :
lst += i.text
else :
for i in data2 :
if "Pages for" not in i.text :
lst += i.text
return lst
#print(srqst(urls[12]), "\n")
def crqst(t):
lst = str()
data = requests.get(t)
soup = BeautifulSoup(data.text,"html.parser")
data2 = soup.find_all("p")
try :
if len(data2) >= 4 :
for i in data2[:4] :
if "Pages for" not in i.text :
lst += i.text
else :
for i in data2 :
if "Pages for" not in i.text :
lst += i.text
    except Exception :
        pass    # continue even if this page fails to download or parse
return lst
simpl={}
idx = []
for k,i in enumerate(s_url):
try:
T = srqst(i)
if len(T) > 100:
simpl[i[34:]]= T
except AttributeError:
idx.append(k)
print(color.BOLD + "Simple Versions :\n"+ color.END)
for i,k in enumerate(list(simpl.keys())[:2]):
print(str(i) + ") ", (k, re.sub("\n", " ", simpl[k])), "\n")
compl={}
idx2 = []
for k,i in enumerate(c_url):
try:
T = crqst(i)
if len(T) > 200:
compl[i[30:]]= T
except AttributeError:
idx2.append(k)
print(color.BOLD + "Complex Versions :\n" + color.END)
for i,k in enumerate(list(compl.keys())[:2]):
print(str(i) + ") ", (k, re.sub("\n", " ", compl[k])), "\n")
keys_a = set(simpl.keys())
keys_b = set(compl.keys())
intersection = keys_a & keys_b
print(str(color.BOLD + "number of common titles is " + color.RED+"{}"+color.END).format(str(len(intersection))), end="\n")
duration = timer() - start
print(str(color.BOLD + color.BLUE + "Duration : " + color.END + "{}s").format(duration))
# ## Second Method using ```map()``` function
# In[21]:
start = timer()
def srqst(t):
lst = str()
data = requests.get(t)
soup = BeautifulSoup(data.text,"html.parser")
data2 = soup.find_all("p")
if len(data2) >= 5 :
for i in data2[:5] :
if "Pages for" not in i.text :
lst += i.text
else :
for i in data2 :
if "Pages for" not in i.text :
lst += i.text
return lst
vs = np.vectorize(srqst)
ls = map(vs, s_url)
def crqst(t):
lst = str()
data = requests.get(t)
soup = BeautifulSoup(data.text,"html.parser")
data2 = soup.find_all("p")
try :
if len(data2) >= 4 :
for i in data2[:4] :
if "Pages for" not in i.text :
lst += i.text
else :
for i in data2 :
if "Pages for" not in i.text :
lst += i.text
    except Exception :
        pass    # continue even if this page fails to download or parse
return lst
vc = np.vectorize(crqst)
lc = map(vc, c_url)
duration = timer() - start
print(str("duration before putting results in a list: " + color.BOLD + "{}s" + color.END).format(duration))
start= timer()
ls = np.array(list(ls))
lc = np.array(list(lc))
# More cleaning: drop the couples for which the extraction only returned Wikipedia's placeholder text
ls = ls[np.delete(range(len(lc)), np.where(lc =='Other reasons this message may be displayed:\n'))]
lc = lc[np.delete(range(len(lc)), np.where(lc =='Other reasons this message may be displayed:\n'))]
duration = timer() - start
print(str(color.BOLD + color.BLUE + "Duration: " + color.END + color.BOLD + "{}s" + color.END).format(duration))
# **Conclusion**
#
# We can conclude that the second method, which uses the mapping operation ```map()```, takes less time (around $44$ seconds) than the first one, which uses a ```for``` loop (around $112$ seconds), to extract the $37$ common texts (simple and complex versions) among $50$ from Wikipedia.
# In[22]:
# First filtering
idx1 = np.array(list(map(lambda x : 100 <= len(x) <= 1400, ls)))