#!/usr/bin/env python
# coding: utf-8
"""Explore schema.org JobPosting markup from the Web Data Commons crawl.

Notebook export; for details see https://skeptric.com/schema-jobposting
"""

# Reload local modules automatically while iterating in Jupyter.
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

import sys
import gzip
from collections import Counter  # FIX: used by the analysis cells but never imported
from pathlib import Path
from urllib.request import urlretrieve

import pandas as pd  # FIX: used as `pd` by the analysis cells but never imported
import rdflib
from tqdm.notebook import tqdm

sys.path.insert(0, '../src')

from lib.rdftool import *

# Data from http://webdatacommons.org/structureddata/2019-12/stats/schema_org_subsets.html
#
# Download both the microdata (1.9GB) and the JSON-LD (700MB)

DEST_DIR = Path('..') / 'data' / 'webcommons'
DEST_DIR.mkdir(parents=True, exist_ok=True)


class TqdmUpTo(tqdm):
    """tqdm progress bar adapted to urlretrieve's reporthook callback."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Advance the bar: b blocks of bsize bytes transferred, tsize total bytes.

        tsize is None until the server reports a Content-Length.
        """
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)  # will also set self.n = b * bsize


def download(url, filename, overwrite=False):
    """Download url to filename with a progress bar.

    Skips the download when filename already exists, unless overwrite is True.
    """
    filename = Path(filename)
    if (not filename.exists()) or overwrite:
        with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024,
                      miniters=1, desc=Path(filename).name) as t:
            urlretrieve(url, filename=filename, reporthook=t.update_to)


JOBS_JSON_2019 = DEST_DIR / '2019-12_json_JobPosting.gz'
JOBS_MD_2019 = DEST_DIR / '2019-12_md_JobPosting.gz'

download('http://data.dws.informatik.uni-mannheim.de/structureddata/2019-12/quads/classspecific/json/schema_JobPosting.gz', JOBS_JSON_2019)
download('http://data.dws.informatik.uni-mannheim.de/structureddata/2019-12/quads/classspecific/md/schema_JobPosting.gz', JOBS_MD_2019)

# [N-quads](https://www.w3.org/TR/n-quads): Subject Predicate Object Graph
#
# First few lines:
# ```
# (node with id) (has schema type) (Job posting) (from URL)
# (same node) (has identifier) (another node) (from same URL)
# (same node) (has title) "Category Manager - Prof.
# Audio Visual Solutions" (from Same URL)
# (same node) (has description) (doubly encoded HTML job description) (from same URL)
# (same node) (has hiring organisation) (hirer node) (from same URL)
# ...
# (hirer node) (has schema type) (Organization) (form same URL)
# (hirer node) (has name) "Anixter International" (from same URL)
# ...
# ```

get_ipython().system('zcat {JOBS_JSON_2019} | head -n 20')

# # JSON

json_f = gzip.open(JOBS_JSON_2019, 'rt')
json_all_graphs = parse_nquads(json_f)

json_seen_domains = set()
json_graphs = []
json_skipped = []

# Take the first JobPosting from each new domain in the first 100k JSON-LD graphs.
for _ in tqdm(range(100_000)):
    graph = next(json_all_graphs)
    dom = graph_domain(graph)
    if dom in json_seen_domains:
        continue
    try:
        jp = list(get_job_postings(graph))[0]
        json_graphs.append((graph, jp))
        json_seen_domains.update([dom])
    except IndexError:
        # Graph contained no JobPosting subject.
        json_skipped.append((graph.identifier, dom))
        continue

len(json_seen_domains), len(json_skipped), len(json_graphs)

[(p, o) for graph, s in json_graphs for p, o in graph.predicate_objects(s)][0]

# pd.DataFrame(c.items(), columns=['type', 'n']).assign(pct = lambda df: df['n'] / len(seen_domains)).sort_values('n', ascending=False)

# # Microdata

f = gzip.open(JOBS_MD_2019, 'rt')
all_graphs = parse_nquads(f)

seen_domains = set()
graphs = []
skipped = []

# Same sampling for the microdata extraction.
for _ in tqdm(range(100_000)):
    graph = next(all_graphs)
    dom = graph_domain(graph)
    if dom in seen_domains:
        continue
    try:
        jp = list(get_job_postings(graph))[0]
    except IndexError:
        skipped.append((graph.identifier, dom))
        # FIX: the original fell through here, appending the *stale* jp from the
        # previous iteration (NameError on first failure) and marking the domain
        # seen even though it yielded nothing — made consistent with the
        # JSON-LD loop above.
        continue
    seen_domains.update([dom])
    graphs.append((graph, jp))

len(seen_domains), len(skipped), len(graphs)

graph, jp = graphs[0]
[p for p, o in graph.predicate_objects(jp)]

# FIX: commented out — `c` is not defined at this point in a fresh run
# (it was a leftover from an earlier interactive session).
# pd.DataFrame(c.items(), columns=['type', 'n']).assign(pct = lambda df: df['n'] / len(seen_domains)).sort_values('n', ascending=False)

# # Analysis

len(json_graphs), len(graphs)

# How often is each type present from JSON-LD graphs

j_counts = pd.DataFrame([Counter(p for p, o in graph.predicate_objects(s)) for graph, s in json_graphs])
j_missing = j_counts.isna().mean().sort_values()
(1 - j_missing).to_frame().T

m_counts = pd.DataFrame([Counter(p for p, o in graph.predicate_objects(s)) for graph, s in graphs])
m_missing = m_counts.isna().mean().sort_values()
(1 - m_missing).to_frame().T


def prop_more_than_1(x):
    """Fraction of rows where the property occurs more than once."""
    return (x > 1).mean()


j_counts.agg(['min', 'mean', 'max', prop_more_than_1])[j_missing.index]
m_counts.agg(['min', 'mean', 'max', prop_more_than_1])[m_missing.index]

# ## Deeper analysis

SDO = rdflib.namespace.Namespace('http://schema.org/')  # FIX: was assigned twice identically


def extract_property(graphs, sdo_type):
    """For each (graph, subject) yield the non-empty list of values of the property.

    BNode values are expanded to nested dicts via graph_to_dict; literals and
    URIs are converted to plain Python values with toPython().
    """
    predicate = SDO[sdo_type]
    for graph, s in graphs:
        items = [graph_to_dict(graph, o) if isinstance(o, rdflib.term.BNode) else o.toPython()
                 for o in graph.objects(s, predicate)]
        if items:
            yield items


def extract_types(graphs, sdo_type):
    """For each (graph, subject) yield a label for the *type* of the property's first value.

    Yields the rdf:type URI for BNodes, the Python type (or datatype URI) for
    literals, 'URI' for URIRefs, and 'Unknown'/'Unknown Object' otherwise.
    """
    predicate = SDO[sdo_type]
    for graph, s in graphs:
        items = list(graph.objects(s, predicate))
        if items:
            item = items[0]
            if isinstance(item, rdflib.term.BNode):
                try:
                    dtype = list(graph.objects(item, rdflib.namespace.RDF.type))
                    yield dtype[0].toPython()
                except Exception:
                    # Best-effort: BNode without a resolvable rdf:type.
                    yield 'Unknown Object'
            elif isinstance(item, rdflib.term.Literal):
                dtype = type(item.toPython())
                if dtype == rdflib.term.Literal:
                    yield item.datatype.toPython()
                else:
                    yield dtype
            elif isinstance(item, rdflib.term.URIRef):
                yield 'URI'
            else:
                yield 'Unknown'


# ### Title

Counter(extract_types(json_graphs, 'title')), Counter(extract_types(graphs, 'JobPosting/title'))

list(extract_property(json_graphs, 'title'))[:5]
list(extract_property(graphs, 'JobPosting/title'))[:5]

# ### Description

Counter(extract_types(json_graphs, 'description')), Counter(extract_types(graphs, 'JobPosting/description'))
# ### JobLocation

Counter(extract_types(json_graphs, 'jobLocation')), Counter(extract_types(graphs, 'JobPosting/jobLocation'))

list(extract_property(json_graphs, 'jobLocation'))[:3]
list(extract_property(graphs, 'JobPosting/jobLocation'))[:3]


_RDF_TYPE_KEY = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'


def extract_subtype(rdf_type, subtype, json=True):
    """Collect the first values of *rdf_type* that are dicts typed as schema.org *subtype*.

    With json=True reads the JSON-LD graphs; otherwise the microdata graphs,
    where the property is additionally namespaced under 'JobPosting/'.
    """
    if json:
        data_graphs = json_graphs
    else:
        data_graphs = graphs
        rdf_type = 'JobPosting/' + rdf_type
    wanted = ['http://schema.org/' + subtype]
    matches = []
    for values in extract_property(data_graphs, rdf_type):
        if not values:
            continue
        head = values[0]
        if isinstance(head, dict) and head.get(_RDF_TYPE_KEY) == wanted:
            matches.append(head)
    return matches


# Totals (1843, 2820)
# Common attributes for jobLocation

Counter(key for place in extract_subtype('jobLocation', 'Place') for key in place)
Counter(key for place in extract_subtype('jobLocation', 'Place', False) for key in place)

# #### Job Location - Address


def _tally_address_types(places, field):
    """Counter of the rdf type (for dicts) or Python type of field's first value."""
    tally = Counter()
    for place in places:
        values = place.get(field)
        if not values:
            continue
        head = values[0]
        if isinstance(head, dict) and _RDF_TYPE_KEY in head:
            tally.update([head[_RDF_TYPE_KEY][0]])
        else:
            tally.update([type(head)])
    return tally


c = _tally_address_types(extract_subtype('jobLocation', 'Place'), 'http://schema.org/address')
c

c = _tally_address_types(extract_subtype('jobLocation', 'Place', False), 'http://schema.org/Place/address')
c

# Keys used inside the JSON-LD address dicts (displayed in a later cell).
c = Counter()
for place in extract_subtype('jobLocation', 'Place'):
    values = place.get('http://schema.org/address')
    if values and isinstance(values[0], dict):
        c.update(values[0].keys())
# Show a few JSON-LD jobLocation addresses that are plain strings.
i = 0
for place in extract_subtype('jobLocation', 'Place'):
    values = place.get('http://schema.org/address')
    if values and isinstance(values[0], str):
        print(values[0])
        i += 1
        if i > 10:
            break

c  # keys inside the JSON-LD address dicts (built in the previous cell)

# md

c = Counter()
for place in extract_subtype('jobLocation', 'Place', False):
    values = place.get('http://schema.org/Place/address')
    if values and isinstance(values[0], dict):
        c.update(values[0].keys())
c

# #### name

_RDF_KEY = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'


def _tally_types(places, field):
    """Counter of the rdf type (for dicts) or Python type of field's first value."""
    tally = Counter()
    for place in places:
        values = place.get(field)
        if not values:
            continue
        head = values[0]
        if isinstance(head, dict) and _RDF_KEY in head:
            tally.update([head[_RDF_KEY][0]])
        else:
            tally.update([type(head)])
    return tally


def _tally_keys(places, field):
    """Counter of the keys of field's first value, when it is a dict."""
    tally = Counter()
    for place in places:
        values = place.get(field)
        if values and isinstance(values[0], dict):
            tally.update(list(values[0]))
    return tally


c = _tally_types(extract_subtype('jobLocation', 'Place'), 'http://schema.org/name')
c

c = _tally_types(extract_subtype('jobLocation', 'Place', False), 'http://schema.org/Place/name')
c

# A few sample place names from each extraction.
i = 0
for place in extract_subtype('jobLocation', 'Place'):
    values = place.get('http://schema.org/name')
    if values:
        print(values[0])
        i += 1
        if i >= 5:
            break

i = 0
for place in extract_subtype('jobLocation', 'Place', False):
    values = place.get('http://schema.org/Place/name')
    if values:
        print(values[0])
        i += 1
        if i >= 5:
            break

# #### geo

c = _tally_types(extract_subtype('jobLocation', 'Place'), 'http://schema.org/geo')
c

c = _tally_types(extract_subtype('jobLocation', 'Place', False), 'http://schema.org/Place/geo')
c

c = _tally_keys(extract_subtype('jobLocation', 'Place'), 'http://schema.org/geo')
c

# Microdata geo values that came through as plain strings.
c = Counter()
for place in extract_subtype('jobLocation', 'Place', False):
    values = place.get('http://schema.org/Place/geo')
    if values and isinstance(values[0], str):
        print(values[0])

c = _tally_keys(extract_subtype('jobLocation', 'Place', False), 'http://schema.org/Place/geo')
c

# Python types of the latitude values (JSON-LD).
c = Counter()
for place in extract_subtype('jobLocation', 'Place'):
    values = place.get('http://schema.org/geo')
    if values and isinstance(values[0], dict):
        coords = values[0]
        if 'http://schema.org/latitude' in coords:
            c.update([type(coords['http://schema.org/latitude'][0])])
c

# NOTE(review): gate kept as in the original — checks *latitude* presence but
# tallies the *longitude* type; confirm that was intended.
c = Counter()
for place in extract_subtype('jobLocation', 'Place'):
    values = place.get('http://schema.org/geo')
    if values and isinstance(values[0], dict):
        coords = values[0]
        if 'http://schema.org/latitude' in coords:
            c.update([type(coords['http://schema.org/longitude'][0])])
c

# Python types of the latitude values (microdata).
c = Counter()
for place in extract_subtype('jobLocation', 'Place', False):
    values = place.get('http://schema.org/Place/geo')
    if values and isinstance(values[0], dict):
        coords = values[0]
        if 'http://schema.org/GeoCoordinates/latitude' in coords:
            c.update([type(coords['http://schema.org/GeoCoordinates/latitude'][0])])
c
# Print microdata (longitude, latitude) pairs.
c = Counter()
for place in extract_subtype('jobLocation', 'Place', False):
    values = place.get('http://schema.org/Place/geo')
    if values and isinstance(values[0], dict):
        coords = values[0]
        if 'http://schema.org/GeoCoordinates/longitude' in coords:
            print(coords['http://schema.org/GeoCoordinates/longitude'][0],
                  coords['http://schema.org/GeoCoordinates/latitude'][0])
c

# Print a few JSON-LD latitude/longitude value lists.
i = 0
for place in extract_subtype('jobLocation', 'Place'):
    values = place.get('http://schema.org/geo')
    if values and isinstance(values[0], dict):
        coords = values[0]
        if 'http://schema.org/latitude' in coords:
            i += 1
            print(coords['http://schema.org/latitude'], coords['http://schema.org/longitude'])
            if i > 10:
                break

# ### Postal Address

c = Counter()
for place in extract_subtype('jobLocation', 'Place'):
    values = place.get('http://schema.org/address')
    if values and isinstance(values[0], dict):
        c.update(values[0].keys())
c

c = Counter()
for place in extract_subtype('jobLocation', 'Place', False):
    values = place.get('http://schema.org/Place/address')
    if values and isinstance(values[0], dict):
        c.update(values[0].keys())
c

# #### addressCountry


def _collect_address_field(address_key, field, json=True):
    """First values of *field* inside the first *address_key* dict of each Place."""
    found = []
    for place in extract_subtype('jobLocation', 'Place', json):
        address = place.get(address_key)
        if address and isinstance(address[0], dict):
            inner = address[0].get(field)
            if inner:
                found.append(inner[0])
    return found


c = _collect_address_field('http://schema.org/address', 'http://schema.org/addressCountry')
c[:5]

Counter(map(type, c))

Counter(k for a in c for k in a if isinstance(a, dict))

# NOTE(review): kept as original — the inner loop over `a` repeats the type
# count once per key of each dict.
Counter(a['http://www.w3.org/1999/02/22-rdf-syntax-ns#type'][0] for a in c for k in a if isinstance(a, dict))

c = _collect_address_field('http://schema.org/Place/address',
                           'http://schema.org/PostalAddress/addressCountry', False)
c[:5]

Counter(map(type, c))

# Empty...
[a for a in c if isinstance(a, dict)]

# Country name

c = []
for place in extract_subtype('jobLocation', 'Place'):
    address = place.get('http://schema.org/address')
    if address and isinstance(address[0], dict):
        country = address[0].get('http://schema.org/addressCountry')
        if country and isinstance(country[0], dict):
            name = country[0].get('http://schema.org/name')
            if name:
                c.append(name[0])
c[:5], len(c), Counter(map(type, c))

# #### addressLocality

c = _collect_address_field('http://schema.org/address', 'http://schema.org/addressLocality')
c[:5], len(c), Counter(map(type, c))

c = _collect_address_field('http://schema.org/Place/address',
                           'http://schema.org/PostalAddress/addressLocality', False)
c[:5], len(c), Counter(map(type, c))

[a for a in c if isinstance(a, dict)]

# #### addressRegion

c = _collect_address_field('http://schema.org/address', 'http://schema.org/addressRegion')
c[:5], len(c), Counter(map(type, c))

# NOTE(review): queries addressLocality here despite the addressRegion heading —
# kept exactly as the original; looks like a copy-paste slip worth confirming.
c = _collect_address_field('http://schema.org/Place/address',
                           'http://schema.org/PostalAddress/addressLocality', False)
c[:5], len(c), Counter(map(type, c))

[a for a in c if isinstance(a, dict)]

# #### postalCode

c = _collect_address_field('http://schema.org/address', 'http://schema.org/postalCode')
c[:5], len(c), Counter(map(type, c))

c = _collect_address_field('http://schema.org/Place/address',
                           'http://schema.org/PostalAddress/postalCode', False)
c[:5], len(c), Counter(map(type, c))

# #### streetAddress

c = _collect_address_field('http://schema.org/address', 'http://schema.org/streetAddress')
c[:5], len(c), Counter(map(type, c))

c = _collect_address_field('http://schema.org/Place/address',
                           'http://schema.org/PostalAddress/streetAddress', False)
c[:5], len(c), Counter(map(type, c))

# # Base Salary

Counter(extract_types(json_graphs, 'baseSalary')), Counter(extract_types(graphs, 'JobPosting/baseSalary'))


def _collect_amount_field(field, json=True):
    """First values of *field* on each MonetaryAmount baseSalary dict."""
    found = []
    for amount in extract_subtype('baseSalary', 'MonetaryAmount', json):
        values = amount.get(field)
        if values:
            found.append(values[0])
    return found


c = _collect_amount_field('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
Counter(c)

c = _collect_amount_field('http://www.w3.org/1999/02/22-rdf-syntax-ns#type', False)
Counter(c)

c = [key for amount in extract_subtype('baseSalary', 'MonetaryAmount') for key in amount]
Counter(c)

c = [key for amount in extract_subtype('baseSalary', 'MonetaryAmount', False) for key in amount]
Counter(c)

# #### currency

c = _collect_amount_field('http://schema.org/currency')
c[:5], len(c), Counter(map(type, c))

c = _collect_amount_field('http://schema.org/MonetaryAmount/currency', False)
c[:5], len(c), Counter(map(type, c))
# #### value


def _amount_values(field, json=True):
    """First values of *field* on each MonetaryAmount baseSalary dict."""
    collected = []
    for amount in extract_subtype('baseSalary', 'MonetaryAmount', json):
        values = amount.get(field)
        if values:
            collected.append(values[0])
    return collected


c = _amount_values('http://schema.org/value')
[v for v in c if type(v) == str][:10], len(c), Counter(map(type, c))

# Shared shorthand for the rdf:type predicate key (used by later cells too).
rdftype = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'

Counter([v[rdftype][0] for v in c if isinstance(v, dict) and rdftype in v])

c = _amount_values('http://schema.org/MonetaryAmount/value', False)
[v for v in c if type(v) == str][:10], len(c), Counter(map(type, c))

Counter([v[rdftype][0] for v in c if isinstance(v, dict) and rdftype in v])

# ### Quantitative Value

# Keys found on values typed as QuantitativeValue.
c = []
for amount in extract_subtype('baseSalary', 'MonetaryAmount'):
    values = amount.get('http://schema.org/value')
    if values and isinstance(values[0], dict) and values[0].get(rdftype) == ['http://schema.org/QuantitativeValue']:
        c += values[0]  # extends with the dict's keys
Counter(c)

c = []
for amount in extract_subtype('baseSalary', 'MonetaryAmount', False):
    values = amount.get('http://schema.org/MonetaryAmount/value')
    if values and isinstance(values[0], dict) and values[0].get(rdftype) == ['http://schema.org/QuantitativeValue']:
        c += values[0]
Counter(c)


def _quantitative_values(amount_field, inner_field, json=True):
    """First *inner_field* values of value dicts typed as QuantitativeValue."""
    collected = []
    for amount in extract_subtype('baseSalary', 'MonetaryAmount', json):
        values = amount.get(amount_field)
        if values and isinstance(values[0], dict) and values[0].get(rdftype) == ['http://schema.org/QuantitativeValue']:
            inner = values[0].get(inner_field)
            if inner:
                collected.append(inner[0])
    return collected


# #### unitText

c = _quantitative_values('http://schema.org/value', 'http://schema.org/unitText')
c[:5], len(c), Counter(map(type, c))

sorted(Counter(c).items(), key=lambda kv: kv[1], reverse=True)[:10]

c = _quantitative_values('http://schema.org/MonetaryAmount/value',
                         'http://schema.org/QuantitativeValue/unitText', False)
c[:5], len(c), Counter(map(type, c))

# #### minValue

c = _quantitative_values('http://schema.org/value', 'http://schema.org/minValue')
c[:5], len(c), Counter(map(type, c))

c = _quantitative_values('http://schema.org/MonetaryAmount/value',
                         'http://schema.org/QuantitativeValue/minValue', False)
c[:5], len(c), Counter(map(type, c))

c = _quantitative_values('http://schema.org/value', 'http://schema.org/maxValue')
c[:5], len(c), Counter(map(type, c))

c = _quantitative_values('http://schema.org/MonetaryAmount/value',
                         'http://schema.org/QuantitativeValue/maxValue', False)
c[:5], len(c), Counter(map(type, c))

c = _quantitative_values('http://schema.org/value', 'http://schema.org/value')
c[:5], len(c), Counter(map(type, c))

c = _quantitative_values('http://schema.org/MonetaryAmount/value',
                         'http://schema.org/QuantitativeValue/value', False)
c[:5], len(c), Counter(map(type, c))

# ### Monetary Amount minvalue
# min/max directly on the MonetaryAmount (not nested in a QuantitativeValue).


def _salary_field(field, json=True):
    """First values of *field* on each MonetaryAmount baseSalary dict."""
    collected = []
    for amount in extract_subtype('baseSalary', 'MonetaryAmount', json):
        values = amount.get(field)
        if values:
            collected.append(values[0])
    return collected


c = _salary_field('http://schema.org/minValue')
c[:10], len(c), Counter(map(type, c))

c = _salary_field('http://schema.org/MonetaryAmount/minValue', False)
c[:10], len(c), Counter(map(type, c))

c = _salary_field('http://schema.org/maxValue')
c[:10], len(c), Counter(map(type, c))

c = _salary_field('http://schema.org/MonetaryAmount/maxValue', False)
c[:10], len(c), Counter(map(type, c))

# ## Date Posted

Counter(extract_types(json_graphs, 'datePosted')), Counter(extract_types(graphs, 'JobPosting/datePosted'))

list(extract_property(json_graphs, 'datePosted'))[:3]
list(extract_property(graphs, 'JobPosting/datePosted'))[:3]

# ### Hiring Organization

Counter(extract_types(json_graphs, 'hiringOrganization')), Counter(extract_types(graphs, 'JobPosting/hiringOrganization'))

list(extract_property(json_graphs, 'hiringOrganization'))[:3]
list(extract_property(graphs, 'JobPosting/hiringOrganization'))[:3]

# Keys used on hiringOrganization dicts typed as Organization.
c = [key for org in extract_subtype('hiringOrganization', 'Organization') for key in org]
Counter(c)

c = [key for org in extract_subtype('hiringOrganization', 'Organization', False) for key in org]
Counter(c)


def _org_field(field, json=True):
    """First values of *field* on each Organization hiringOrganization dict."""
    collected = []
    for org in extract_subtype('hiringOrganization', 'Organization', json):
        values = org.get(field)
        if values:
            collected.append(values[0])
    return collected


# #### name

c = _org_field('http://schema.org/name')
c[:10], len(c), Counter(map(type, c))

c = _org_field('http://schema.org/Organization/name', False)
c[:10], len(c), Counter(map(type, c))

# #### sameAs

c = _org_field('http://schema.org/sameAs')
c[:10], len(c), Counter(map(type, c))

c = _org_field('http://schema.org/Organization/sameAs', False)
c[:10], len(c), Counter(map(type, c))

# #### logo

c = _org_field('http://schema.org/logo')
c[:10], len(c), Counter(map(type, c))

Counter([a[rdftype][0] for a in c if isinstance(a, dict) and rdftype in a])
Counter([k for a in c if isinstance(a, dict) and rdftype in a for k in a])
[a for a in c if isinstance(a, dict) and rdftype in a][:10]

c = _org_field('http://schema.org/Organization/logo', False)
c[:10], len(c), Counter(map(type, c))

Counter([k for a in c if isinstance(a, dict) and rdftype in a for k in a])
[a for a in c if isinstance(a, dict) and rdftype in a][:10]

# #### url

c = _org_field('http://schema.org/url')
c[:10], len(c), Counter(map(type, c))

c = _org_field('http://schema.org/Organization/url', False)
c[:10], len(c), Counter(map(type, c))

# ## validThrough

Counter(extract_types(json_graphs, 'validThrough')), Counter(extract_types(graphs, 'JobPosting/validThrough'))

list(extract_property(json_graphs, 'validThrough'))[:3]
list(extract_property(graphs, 'JobPosting/validThrough'))[:3]

# ### url

Counter(extract_types(json_graphs, 'url')), Counter(extract_types(graphs, 'JobPosting/url'))

list(extract_property(json_graphs, 'url'))[:3]
list(extract_property(graphs, 'JobPosting/url'))[:3]
# ### industry


def _type_survey(prop):
    """Type counters for a property in the JSON-LD and microdata samples."""
    return (Counter(extract_types(json_graphs, prop)),
            Counter(extract_types(graphs, 'JobPosting/' + prop)))


def _share_table(data, prop, top, skip_dicts=True):
    """Top values of a property with their share of all occurrences."""
    values = (v for vs in extract_property(data, prop) for v in vs
              if not skip_dicts or type(v) != dict)
    return (pd.Series(values).value_counts().to_frame()
            .assign(pct=lambda df: df[0] / sum(df[0])).head(top).T)


_type_survey('industry')
_share_table(json_graphs, 'industry', 20, skip_dicts=False)
_share_table(graphs, 'JobPosting/industry', 20, skip_dicts=False)

# ### educationRequirements

_type_survey('educationRequirements')
_share_table(json_graphs, 'educationRequirements', 20)
_share_table(graphs, 'JobPosting/educationRequirements', 20)

# ### workHours

_type_survey('workHours')
list(extract_property(json_graphs, 'workHours'))[:10]
list(extract_property(graphs, 'JobPosting/workHours'))[:10]

# ### experienceRequirements

_type_survey('experienceRequirements')
_share_table(json_graphs, 'experienceRequirements', 6)
_share_table(graphs, 'JobPosting/experienceRequirements', 6)

# ### occupationalCategory

_type_survey('occupationalCategory')
_share_table(json_graphs, 'occupationalCategory', 20)
_share_table(graphs, 'JobPosting/occupationalCategory', 20)

# ### qualifications

_type_survey('qualifications')
_share_table(json_graphs, 'qualifications', 4)
_share_table(graphs, 'JobPosting/qualifications', 4)

# ### identifier

_type_survey('identifier')
list(extract_property(json_graphs, 'identifier'))[:3]
list(extract_property(graphs, 'JobPosting/identifier'))[:3]

# ### salaryCurrency

_type_survey('salaryCurrency')
_share_table(json_graphs, 'salaryCurrency', 10)
_share_table(graphs, 'JobPosting/salaryCurrency', 10)

# ### employmentType

_type_survey('employmentType')
_share_table(json_graphs, 'employmentType', 10)
_share_table(graphs, 'JobPosting/employmentType', 10)

# When it's multiple it's normally a listing

list(x for x in extract_property(json_graphs, 'employmentType') if len(x) > 1)[:20]

# ### jobBenefits

_type_survey('jobBenefits')
list(extract_property(json_graphs, 'jobBenefits'))[:10]
list(extract_property(graphs, 'JobPosting/jobBenefits'))[:3]

# ### Skills

_type_survey('skills')
list(extract_property(json_graphs, 'skills'))[:10]
list(extract_property(graphs, 'JobPosting/skills'))[:3]

# ### image

_type_survey('image')
list(extract_property(json_graphs, 'image'))[:3]
list(extract_property(graphs, 'JobPosting/image'))[:3]

# ### jobLocationType

_type_survey('jobLocationType')
_share_table(json_graphs, 'jobLocationType', 10)
_share_table(graphs, 'JobPosting/jobLocationType', 10)

# ### incentiveCompensation

_type_survey('incentiveCompensation')
list(extract_property(json_graphs, 'incentiveCompensation'))[:10]
list(extract_property(graphs, 'JobPosting/incentiveCompensation'))[:10]