#!/usr/bin/env python
# coding: utf-8

# ## The Emoji of Wikipedia
#
# *Labeling the planet with emoji and other symbols that appear in Wikipedia articles (the making of)*

# I want to extract all occurrences of all emoji (and emoji-like Unicode characters, i.e. ones that look symbolic rather than language-based) from Wikipedia. I already have my own indexer of a [Wikipedia database download](https://en.wikipedia.org/wiki/Wikipedia:Database_download), built specifically for working with the Wikipedia pages that have a location associated with them. So all I need to do is have it feed me all those pages, then check whether any of the emoji or Unicode characters I'm interested in appear in them. Let's get started!

# In[1]:

from pandas import DataFrame
from pathlib import Path
from wikiparse import config

scratch_folder = Path(config.folder)

# First, we need a list of as many emoji and Unicode characters as possible. I copied emoji and Unicode characters from anywhere I could find lists of them without any meta information, including [getemoji.com](https://getemoji.com/) and [this Medium post](https://medium.com/feedium/huge-list-of-unicode-and-emoji-symbols-to-copy-and-paste-df1f408767a6). This approach is kind of haphazard, but given that this is just a fun afternoon project, that's fine. Because it seems to cause rendering issues, I've put the emoji in a file.

# In[2]:

with open('unicode_collection.txt', encoding='utf-8') as f:
    misc_unicode = f.read()

# Treat the file as a bag of characters: each distinct character becomes one set element.
unicodeset = set(misc_unicode)

import random
print(' '.join(random.sample(list(unicodeset), 100)))  # random.sample needs a sequence, not a set

# After running the following code a few times, these are the symbols that occurred so often that they became less interesting (or that are used in language rather than as symbols). Again, not really being systematic here; this is a bit arbitrary.

# In[3]:

removals = set('\n #*-$−=Ἀ°Ῥὸ©ἔἸἀ™Ἄῦἴἱὰ→ἄᾴἰῖἝᾶῶὮὶὔἁῆ€ῷὐ─ἐ₨₽᾽ἡ∼∆₹₱ὄὴὁῴ\ Ἱ≈ῃ∗ἂἾἤὕὨ∟元ἜἘἑἕὼὈ円∀ῥ≥Ἠὺ♂®ὖὲἙἼ≤Ὁῳ¢┌Ἡ℃⋆Ὄ️╫₤ῇ῾Ἁᾷ↓└Ἅ√╡ὦ↔╟῎ᾱ╣│℉¥╦원⋅₳←╨▪Ἦὑ¤↑')
unicodeset = unicodeset - removals

# Okay, we're ready to start iterating over those pages, so it's time to boot up the indexer. If you're trying to run this at home and don't already have the index computed, this next step will take a while.

# In[4]:

from wikiparse import geo_indexer

xml_filename = config.xml
indexer = geo_indexer.Indexer(xml_filename, scratch_folder=scratch_folder)

# In[5]:

page_numbers = indexer.get_page_numbers()

# In[6]:

from collections import defaultdict
import time

# The full loop below takes a few hours to run, so it's commented out to save time if I have to rerun the whole notebook. It iterates through each Wikipedia page that has a geographic coordinate tag (~0.65 million of them in this version of English Wikipedia) and checks every character in the page against the set of Unicode characters from above. The core test is just a set intersection, sketched on a toy example right below, followed by the full loop.
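# To make that concrete: `set(text)` collapses a page to its distinct characters, and intersecting that with the symbol set yields every symbol that appears at least once. Here's a minimal sketch on a made-up snippet; the text and tiny symbol set are purely illustrative, not from the real data.

# In[ ]:

sample_text = "The observatory ⭐ reopened in 1998 near the ♻ recycling depot"  # hypothetical page text
sample_symbols = {'⭐', '♻', '♘'}  # tiny stand-in for unicodeset

# set() reduces the text to its distinct characters, so the intersection
# is exactly the symbols from the set that occur somewhere in the text.
found = set(sample_text).intersection(sample_symbols)
print(found)  # prints {'⭐', '♻'} (in some order)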
# In[7]:

# matched = set()
# symbol_to_pages = defaultdict(list)
# pages_to_process = len(page_numbers)
# ts = time.time()
# for i in range(pages_to_process):
#     page = indexer.get_page_by_num(page_numbers[i])
#     intersection = set(page._full_text).intersection(unicodeset)
#     if len(intersection) > 0:
#         print(i, page.title, intersection)
#         matched = matched.union(intersection)
#         for symbol in list(intersection):
#             symbol_to_pages[symbol].append(page_numbers[i])
# print(f"processed {pages_to_process} in {round((time.time()-ts)/60)} minutes")

# In[8]:

import json

# with open(scratch_folder/'symbols_to_page.json', 'w') as f:
#     f.write(json.dumps(symbol_to_pages))

# In[9]:

with open(scratch_folder/'symbols_to_page.json') as f:
    symbols_to_pages = json.loads(f.read())

# In[10]:

df = DataFrame(symbols_to_pages.items(), columns=["symbol", "pages"])

# In[11]:

df

# Okay, so we found about 250 symbols, some of which have a lot of occurrences (like ♀ and ♦). How many occur only once?

# In[12]:

df['pagecount'] = df.pages.map(lambda l: len(l))
df.to_csv(scratch_folder/'emoji.csv')

# In[13]:

df[df.pagecount == 1]

# Next, we need to extract the decimal coordinates from the pages and put them in the dataframe. For each symbol, I just take the first page it appears on and use that page's coordinates.

# In[14]:

df['lat'] = df.pages.map(lambda page_nums: indexer.get_page_by_num(page_nums[0]).coords()[0])
df['lon'] = df.pages.map(lambda page_nums: indexer.get_page_by_num(page_nums[0]).coords()[1])

# (It's complaining because there are a few coordinate tags that the indexer can't extract actual coordinates from.)

# Then I'll put it into the JSON format that my website uses, making sure to skip any entries with invalid coordinates.

# In[15]:

output = []
for i, row in df[(df.lon != 0)].iterrows():
    page = indexer.get_page_by_num(row.pages[0])
    output.append({"title": row.symbol,
                   "coordsDecimal": {"lat": row.lat, "lon": row.lon},
                   "page_name": page.title})

with open(scratch_folder/"emojis_of_wikipedia.json", 'w') as f:
    f.write(json.dumps(output))

# Done!
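# As a quick sanity check (an extra step, not part of the original run), the exported file can be read back to confirm each record has the fields the website expects: "title", "coordsDecimal" with "lat"/"lon", and "page_name".

# In[ ]:

with open(scratch_folder/"emojis_of_wikipedia.json") as f:
    emoji_points = json.loads(f.read())

print(len(emoji_points), "symbols exported")
for record in emoji_points[:3]:  # peek at the first few records
    print(record["title"], record["coordsDecimal"], record["page_name"])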