#!/usr/bin/env python
# coding: utf-8

# Now that we have a SQLite database with indices, page titles, and coordinate strings, let's extract all the metadata out of those coordinate strings into a table so it's queryable.
#
# This should be run after the other notebook that extracts the coordinate strings.

# In[33]:

import csv
import json
from wikiparse import indexer, syntax_parser as sp
import time
import os
import sqlite3
import random

# In[34]:

dumps = indexer.load_dumps(build_index=False, scratch_folder='py3')
english = dumps['en']

# In[35]:

# english.db.close()

# In[36]:

c = english.cursor

# Before we create the table, let's get a complete list of the entries we're going to want. That is, let's look at all the coordinate strings we've extracted from each page and pull the list of keywords out of them.

# In[38]:

result = c.execute('''SELECT page_num,coords,title FROM indices
                      WHERE coords != "" ''').fetchall()
coordStrings = {item[0]: item[1] for item in result}
idx_to_title = {item[0]: item[2] for item in result}
list(coordStrings.items())[:10]

# In[39]:

len(coordStrings), type(coordStrings)

# If a page has more than one coordinate string, choose the one that's displayed at the top of the page (`display=title`) or, failing that, the first one.

# In[40]:

for page_num in coordStrings:
    if '||' in coordStrings[page_num]:
        pageCoordStrings = coordStrings[page_num].split('||')
        coordStrings[page_num] = pageCoordStrings[0]
        for s in pageCoordStrings:
            if "display=title" in s:
                coordStrings[page_num] = s

# In[41]:

list(coordStrings.items())[:10]

# Some Coord templates have a `notes` argument (https://en.wikipedia.org/wiki/Template:Coord#Examples) which can itself contain pipes that cut off the rest of the template. Since that tag seems to come after the other important tags, let's ignore this problem for now.

# In[42]:

def getKeywords(coordString, verbose=False):
    """Split a Coord template string into a dict of its keyword arguments."""
    if verbose:
        print(coordString)
    keywords = {}
    rest = []
    items = coordString.split('|')
    for item in items:
        if '=' in item:
            keywords[item.split('=')[0]] = item.split('=')[1]
        elif ':' in item:
            keywords[item.split(':')[0]] = item.split(':')[1]
        else:
            # Positional arguments, i.e. the coordinates themselves.
            rest.append(item)
    # return keywords, '|'.join(rest)
    return keywords

print(getKeywords('Coord|8|8|N|165|1|W|type:event|name=Apollo 8 landing'))
print(getKeywords('Coord|64|N|150|W|region:US-AK_type:adm1st_scale:10000000|display=title|notes={{Cite gnis|1785533|State of Alaska'))
print(getKeywords('Coord|36|42|N|3|13|E|type:city'))

# We only care about a subset of keywords, so let's make a whitelist.

# In[43]:

from collections import defaultdict

keywords = defaultdict(int)
for coordString in coordStrings.values():
    for kw in getKeywords(coordString).keys():
        keywords[kw.strip()] += 1
keywords

# In[44]:

{kw for kw, count in keywords.items() if count > 10}

# Most of these look good, except for a few that are duplicates or weird artifacts of the imperfect string processing.

# In[45]:

entriesList = [
    'accessdate', 'date', 'dim', 'display', 'elevation', 'format',
    'globe', 'id', 'name', 'nosave', 'notes', 'publisher', 'reason',
    'region', 'scale', 'source', 'title', 'type', 'upright', 'url',
    'work']

# Print column definitions to paste into the CREATE TABLE statement below.
for e in entriesList:
    print(e, 'TEXT,')

# It would be nice to see examples of values for the keywords so we can see how they're used.
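# Before writing a proper helper, a throwaway comprehension can surface a few raw values for a single keyword. This is just a quick peek (the next cell builds a reusable version that samples randomly):

# In[ ]:

# Peek at the first few values of the `type` keyword among the first
# 200 coordinate strings.
[cs.split('|type:')[1].split('|')[0]
 for cs in list(coordStrings.values())[:200]
 if '|type:' in cs][:5]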
# In[46]:

from numpy.random import shuffle

def findKeywordExample(kw, count=1, debug=False):
    """Return up to `count` example values for keyword `kw`, sampled randomly."""
    kws = []
    coordStringsList = list(coordStrings.values())
    shuffle(coordStringsList)
    for cs in coordStringsList:
        if f'|{kw}=' in cs:
            kws.append(cs.split(f'|{kw}=')[1].split('|')[0])
            if debug:
                print(cs)
        elif f'|{kw}:' in cs:
            kws.append(cs.split(f'|{kw}:')[1].split('|')[0])
            if debug:
                print(cs)
        if len(kws) >= count:
            break
    return kws

# In[47]:

findKeywordExample('name', 10, True)

# Now we can create all the columns en masse.

# In[50]:

c.execute('''CREATE TABLE coords
             (coords TEXT,
              lat REAL DEFAULT 0,
              lon REAL DEFAULT 0,
              page_num INTEGER PRIMARY KEY,
              accessdate TEXT DEFAULT '',
              date TEXT DEFAULT '',
              dim TEXT DEFAULT '',
              display TEXT DEFAULT '',
              elevation TEXT DEFAULT '',
              format TEXT DEFAULT '',
              globe TEXT DEFAULT '',
              id TEXT DEFAULT '',
              name TEXT DEFAULT '',
              nosave TEXT DEFAULT '',
              notes TEXT DEFAULT '',
              publisher TEXT DEFAULT '',
              reason TEXT DEFAULT '',
              region TEXT DEFAULT '',
              scale TEXT DEFAULT '',
              source TEXT DEFAULT '',
              title TEXT DEFAULT '',
              type TEXT DEFAULT '',
              upright TEXT DEFAULT '',
              url TEXT DEFAULT '',
              work TEXT DEFAULT '')''')

# In[51]:

# c.execute('''DROP TABLE coords''')

# In[59]:

def extract_lat_lon(coord_string):
    """Pull the positional (coordinate) arguments off the front of a Coord string."""
    split = coord_string.split('|')
    coord_list = []
    for s in split:
        # The first keyword argument marks the end of the positional coordinates.
        if ':' in s or '=' in s:
            break
        if 'Coord' not in s and 'LAT' not in s and 'LONG' not in s and 'coord' not in s:
            coord_list.append(s)
    return coord_list

begin = random.randint(0, len(coordStrings)-10)
for i in range(begin, begin+10):
    print(extract_lat_lon(list(coordStrings.values())[i]))

# In[60]:

def convert_to_decimal(coord_list):
    """Convert a list of positional Coord arguments (decimal degrees, or
    degrees/minutes/seconds plus N/S/E/W letters) to a [lat, lon] pair in
    signed decimal degrees."""
    # print(' '.join(coord_list), end='\t')
    coord_list = [s.strip().lower() for s in coord_list if s.strip() != '']
    if len(coord_list) < 2:
        return [0, 0]
    if len(coord_list) == 2:
        # Already signed decimal degrees.
        return [float(coord_list[0]), float(coord_list[1])]
    directions = 0
    for s in coord_list:
        if s in 'nesw':
            directions += 1
    if directions != 2:
        raise Exception(directions, "wrong number of directions for:", coord_list)
    lat = []
    lon = []
    creating_lat = True
    for s in coord_list:
        if creating_lat:
            if s in 'ns':
                # The direction letter ends the latitude part; pad missing
                # minutes/seconds with zeros and record the sign.
                creating_lat = False
                while len(lat) < 3:
                    lat.append(0)
                lat.append(1 if s == 'n' else -1)
            else:
                if ',' in s:
                    s = s.replace(',', '.')  # some templates use decimal commas
                lat.append(float(s))
        else:
            if s in 'ew':
                while len(lon) < 3:
                    lon.append(0)
                lon.append(1 if s == 'e' else -1)
            else:
                if ',' in s:
                    s = s.replace(',', '.')
                lon.append(float(s))
    return [(lat[0] + lat[1]/60 + lat[2]/3600) * lat[3],
            (lon[0] + lon[1]/60 + lon[2]/3600) * lon[3]]

count = 1000
begin = random.randint(0, len(coordStrings)-count)
for i in range(begin, begin+count):
    coord_list = extract_lat_lon(list(coordStrings.values())[i])
    try:
        result = convert_to_decimal(coord_list)
    except Exception as e:
        print(coord_list, e)
    # print(result)

# In[61]:

def insertKeywordDict(page_num, coords, kws, debug=False):
    """Insert one page's coordinate metadata into the coords table."""
    allowed_keys = [kw for kw in kws.keys() if kw in entriesList]
    allowed_vals = [kws[kw] for kw in allowed_keys]
    try:
        lat, lon = convert_to_decimal(extract_lat_lon(coords))
    except Exception:
        lat = lon = 0
    keys = ['coords', 'lat', 'lon', 'page_num'] + allowed_keys
    vals = [coords, lat, lon, page_num] + allowed_vals
    qstring = f'INSERT INTO coords ({",".join(keys)}) VALUES ({",".join(["?" for item in vals])})'
    if debug:
        print(qstring, vals)
    c.execute(qstring, vals)

# insertKeywordDict(0, 'Coord|64|N|150|W|display=title',
#                   {'region': 'US-AK_type', 'display': 'title', 'notes': '{{Cite gnis'},
#                   debug=True)
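# Before kicking off the long insert run, a quick spot check of `convert_to_decimal` against hand-worked values, reusing the Coord examples from earlier. This is just a sanity-test sketch; the tolerance is arbitrary:

# In[ ]:

spot_checks = [
    (['8', '8', 'N', '165', '1', 'W'], (8 + 8/60, -(165 + 1/60))),
    (['36', '42', 'N', '3', '13', 'E'], (36 + 42/60, 3 + 13/60)),
    (['47.6', '-122.33'], (47.6, -122.33)),  # already-decimal case
]
for coord_list, (want_lat, want_lon) in spot_checks:
    got_lat, got_lon = convert_to_decimal(coord_list)
    assert abs(got_lat - want_lat) < 1e-6, coord_list
    assert abs(got_lon - want_lon) < 1e-6, coord_list
print('all spot checks passed')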
# Grab a drink before running the next cell; it'll take a while.

# In[71]:

start = 100_000  # resume offset; set to 0 for a fresh run
succeeded = 0
coordStringsList = list(coordStrings.items())
for i in range(start, len(coordStringsList)):
    page_idx, coordString = coordStringsList[i]
    try:
        keywords = getKeywords(coordString)
        keywords['title'] = idx_to_title[page_idx]
        insertKeywordDict(page_idx, coordString, keywords)
        succeeded += 1
    except sqlite3.IntegrityError:
        # A row for this page_num already exists; skip it.
        pass
    if i % 100 == 0:
        english.db.commit()
        print(f'{round(100*i/len(coordStrings), 2)}% ({i}) succeeded: {succeeded}', end='\r')
english.db.commit()  # commit the final partial batch

# In[72]:

# c.execute('''SELECT * FROM coords WHERE page_num > 19000000''').fetchall()

# In[73]:

l = c.execute('''SELECT title FROM coords
                 WHERE lat BETWEEN 47.6 AND 47.7
                 AND lon BETWEEN -122.35 AND -122.34 ''').fetchall()
l

# How many cities do we have?

# In[92]:

l = c.execute('''SELECT title,coords FROM coords WHERE type = 'city' ''').fetchall()
len(l)

# What's the coverage for this tag?

# In[75]:

[el[0] for el in l if 'Seattle' in el[0]]

# In[27]:

[el[0] for el in l if 'Portland' in el[0]]

# In[29]:

[el[0] for el in l if 'Brooklyn' in el[0]]

# Not great.

# In[98]:

l = c.execute('''SELECT title,type,lat,lon FROM coords''').fetchall()
[el for el in l if 'Seattle' == el[0]]

# That's because Seattle isn't tagged as a city.

# In[99]:

[el for el in l if "Portland, Oregon" == el[0]]

# In[101]:

[el for el in l if "San Francisco" == el[0]]

# In[102]:

len(l)

# Hmm, probably too many to run TF-IDF on all of them. Can we narrow it down?

# In[132]:

l = c.execute('''SELECT title,coords FROM coords WHERE display = 'inline,title' ''').fetchall()
len(l)

# In[133]:

[el for el in l if 'Seattle' == el[0]]

# In[135]:

[el for el in l if 'Portland, Oregon' == el[0]]

# This narrows it down by ~50%, at least.

# In[136]:

l[:50]

# I could also pull in this dataset of cities (cities15000.tsv, which looks like the GeoNames cities15000 extract), which would narrow things down a lot further, but then I'd probably have to do a bunch of fuzzy name matching :(

# In[112]:

import pandas as pd

# In[116]:

index_col = ['nn', 'name', 'name_ascii', 'other_names', 'lat', 'lon',
             'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
             'region', 'date']

# In[117]:

len(index_col)

# In[120]:

cities_df = pd.read_csv('cities15000.tsv', sep='\t', header=None, names=index_col)

# In[126]:

# Note: `!= float('nan')` is always True (NaN compares unequal to everything,
# including itself), so use notna() to actually drop missing values.
cities_df[cities_df.f.notna()].sample(10)

# Insert fuzzy name matching (if necessary)
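# A minimal sketch of what that fuzzy matching could look like, using the standard library's difflib. The helper name, the 0.9 cutoff, and the comma-stripping heuristic are all guesses, not a tested approach:

# In[ ]:

from difflib import get_close_matches

city_names = cities_df['name_ascii'].dropna().tolist()

def match_city(wiki_title, cutoff=0.9):
    # Wikipedia titles often carry a disambiguator, e.g. "Portland, Oregon";
    # strip everything after the first comma before matching.
    bare = wiki_title.split(',')[0].strip()
    return get_close_matches(bare, city_names, n=3, cutoff=cutoff)

print(match_city('Portland, Oregon'))
print(match_city('Seattle'))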