#!/usr/bin/env python
# coding: utf-8
"""Explore schema.org JobPosting markup from the Web Data Commons crawl.

Notebook export; for details see https://skeptric.com/schema-jobposting
"""

# Reload local modules automatically while iterating in Jupyter.
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

import sys
import gzip
from collections import Counter  # FIX: used by the analysis cells but never imported
from pathlib import Path
from urllib.request import urlretrieve

import pandas as pd  # FIX: used as `pd` by the analysis cells but never imported
import rdflib
from tqdm.notebook import tqdm

sys.path.insert(0, '../src')

from lib.rdftool import *

# Data from http://webdatacommons.org/structureddata/2019-12/stats/schema_org_subsets.html
#
# Download both the microdata (1.9GB) and the JSON-LD (700MB)

DEST_DIR = Path('..') / 'data' / 'webcommons'
DEST_DIR.mkdir(parents=True, exist_ok=True)


class TqdmUpTo(tqdm):
    """tqdm progress bar adapted to urlretrieve's reporthook callback."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Advance the bar: b blocks of bsize bytes transferred, tsize total bytes.

        tsize is None until the server reports a Content-Length.
        """
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)  # will also set self.n = b * bsize


def download(url, filename, overwrite=False):
    """Download url to filename with a progress bar.

    Skips the download when filename already exists, unless overwrite is True.
    """
    filename = Path(filename)
    if (not filename.exists()) or overwrite:
        with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024,
                      miniters=1, desc=Path(filename).name) as t:
            urlretrieve(url, filename=filename, reporthook=t.update_to)


JOBS_JSON_2019 = DEST_DIR / '2019-12_json_JobPosting.gz'
JOBS_MD_2019 = DEST_DIR / '2019-12_md_JobPosting.gz'

download('http://data.dws.informatik.uni-mannheim.de/structureddata/2019-12/quads/classspecific/json/schema_JobPosting.gz', JOBS_JSON_2019)
download('http://data.dws.informatik.uni-mannheim.de/structureddata/2019-12/quads/classspecific/md/schema_JobPosting.gz', JOBS_MD_2019)

# [N-quads](https://www.w3.org/TR/n-quads): Subject Predicate Object Graph
#
# First few lines:
# ```
# (node with id) (has schema type) (Job posting) (from URL)
# (same node) (has identifier) (another node) (from same URL)
# (same node) (has title) "Category Manager - Prof.
# Audio Visual Solutions" (from Same URL)
# (same node) (has description) (doubly encoded HTML job description) (from same URL)
# (same node) (has hiring organisation) (hirer node) (from same URL)
# ...
# (hirer node) (has schema type) (Organization) (form same URL)
# (hirer node) (has name) "Anixter International" (from same URL)
# ...
# ```

get_ipython().system('zcat {JOBS_JSON_2019} | head -n 20')

# # JSON

json_f = gzip.open(JOBS_JSON_2019, 'rt')
json_all_graphs = parse_nquads(json_f)

json_seen_domains = set()
json_graphs = []
json_skipped = []

# Take the first JobPosting from each new domain in the first 100k JSON-LD graphs.
for _ in tqdm(range(100_000)):
    graph = next(json_all_graphs)
    dom = graph_domain(graph)
    if dom in json_seen_domains:
        continue
    try:
        jp = list(get_job_postings(graph))[0]
        json_graphs.append((graph, jp))
        json_seen_domains.update([dom])
    except IndexError:
        # Graph contained no JobPosting subject.
        json_skipped.append((graph.identifier, dom))
        continue

len(json_seen_domains), len(json_skipped), len(json_graphs)

[(p, o) for graph, s in json_graphs for p, o in graph.predicate_objects(s)][0]

# pd.DataFrame(c.items(), columns=['type', 'n']).assign(pct = lambda df: df['n'] / len(seen_domains)).sort_values('n', ascending=False)

# # Microdata

f = gzip.open(JOBS_MD_2019, 'rt')
all_graphs = parse_nquads(f)

seen_domains = set()
graphs = []
skipped = []

# Same sampling for the microdata extraction.
for _ in tqdm(range(100_000)):
    graph = next(all_graphs)
    dom = graph_domain(graph)
    if dom in seen_domains:
        continue
    try:
        jp = list(get_job_postings(graph))[0]
    except IndexError:
        skipped.append((graph.identifier, dom))
        # FIX: the original fell through here, appending the *stale* jp from the
        # previous iteration (NameError on first failure) and marking the domain
        # seen even though it yielded nothing — made consistent with the
        # JSON-LD loop above.
        continue
    seen_domains.update([dom])
    graphs.append((graph, jp))

len(seen_domains), len(skipped), len(graphs)

graph, jp = graphs[0]
[p for p, o in graph.predicate_objects(jp)]

# FIX: commented out — `c` is not defined at this point in a fresh run
# (it was a leftover from an earlier interactive session).
# pd.DataFrame(c.items(), columns=['type', 'n']).assign(pct = lambda df: df['n'] / len(seen_domains)).sort_values('n', ascending=False)

# # Analysis

len(json_graphs), len(graphs)

# How often is each type present from JSON-LD graphs

j_counts = pd.DataFrame([Counter(p for p, o in graph.predicate_objects(s)) for graph, s in json_graphs])
j_missing = j_counts.isna().mean().sort_values()
(1 - j_missing).to_frame().T

m_counts = pd.DataFrame([Counter(p for p, o in graph.predicate_objects(s)) for graph, s in graphs])
m_missing = m_counts.isna().mean().sort_values()
(1 - m_missing).to_frame().T


def prop_more_than_1(x):
    """Fraction of rows where the property occurs more than once."""
    return (x > 1).mean()


j_counts.agg(['min', 'mean', 'max', prop_more_than_1])[j_missing.index]
m_counts.agg(['min', 'mean', 'max', prop_more_than_1])[m_missing.index]

# ## Deeper analysis

SDO = rdflib.namespace.Namespace('http://schema.org/')  # FIX: was assigned twice identically


def extract_property(graphs, sdo_type):
    """For each (graph, subject) yield the non-empty list of values of the property.

    BNode values are expanded to nested dicts via graph_to_dict; literals and
    URIs are converted to plain Python values with toPython().
    """
    predicate = SDO[sdo_type]
    for graph, s in graphs:
        items = [graph_to_dict(graph, o) if isinstance(o, rdflib.term.BNode) else o.toPython()
                 for o in graph.objects(s, predicate)]
        if items:
            yield items


def extract_types(graphs, sdo_type):
    """For each (graph, subject) yield a label for the *type* of the property's first value.

    Yields the rdf:type URI for BNodes, the Python type (or datatype URI) for
    literals, 'URI' for URIRefs, and 'Unknown'/'Unknown Object' otherwise.
    """
    predicate = SDO[sdo_type]
    for graph, s in graphs:
        items = list(graph.objects(s, predicate))
        if items:
            item = items[0]
            if isinstance(item, rdflib.term.BNode):
                try:
                    dtype = list(graph.objects(item, rdflib.namespace.RDF.type))
                    yield dtype[0].toPython()
                except Exception:
                    # Best-effort: BNode without a resolvable rdf:type.
                    yield 'Unknown Object'
            elif isinstance(item, rdflib.term.Literal):
                dtype = type(item.toPython())
                if dtype == rdflib.term.Literal:
                    yield item.datatype.toPython()
                else:
                    yield dtype
            elif isinstance(item, rdflib.term.URIRef):
                yield 'URI'
            else:
                yield 'Unknown'


# ### Title

Counter(extract_types(json_graphs, 'title')), Counter(extract_types(graphs, 'JobPosting/title'))

list(extract_property(json_graphs, 'title'))[:5]
list(extract_property(graphs, 'JobPosting/title'))[:5]

# ### Description

Counter(extract_types(json_graphs, 'description')), Counter(extract_types(graphs, 'JobPosting/description'))
# ### JobLocation

Counter(extract_types(json_graphs, 'jobLocation')), Counter(extract_types(graphs, 'JobPosting/jobLocation'))

list(extract_property(json_graphs, 'jobLocation'))[:3]
list(extract_property(graphs, 'JobPosting/jobLocation'))[:3]


_RDF_TYPE_KEY = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'


def extract_subtype(rdf_type, subtype, json=True):
    """Collect the first values of *rdf_type* that are dicts typed as schema.org *subtype*.

    With json=True reads the JSON-LD graphs; otherwise the microdata graphs,
    where the property is additionally namespaced under 'JobPosting/'.
    """
    if json:
        data_graphs = json_graphs
    else:
        data_graphs = graphs
        rdf_type = 'JobPosting/' + rdf_type
    wanted = ['http://schema.org/' + subtype]
    matches = []
    for values in extract_property(data_graphs, rdf_type):
        if not values:
            continue
        head = values[0]
        if isinstance(head, dict) and head.get(_RDF_TYPE_KEY) == wanted:
            matches.append(head)
    return matches


# Totals (1843, 2820)
# Common attributes for jobLocation

Counter(key for place in extract_subtype('jobLocation', 'Place') for key in place)
Counter(key for place in extract_subtype('jobLocation', 'Place', False) for key in place)

# #### Job Location - Address


def _tally_address_types(places, field):
    """Counter of the rdf type (for dicts) or Python type of field's first value."""
    tally = Counter()
    for place in places:
        values = place.get(field)
        if not values:
            continue
        head = values[0]
        if isinstance(head, dict) and _RDF_TYPE_KEY in head:
            tally.update([head[_RDF_TYPE_KEY][0]])
        else:
            tally.update([type(head)])
    return tally


c = _tally_address_types(extract_subtype('jobLocation', 'Place'), 'http://schema.org/address')
c

c = _tally_address_types(extract_subtype('jobLocation', 'Place', False), 'http://schema.org/Place/address')
c

# Keys used inside the JSON-LD address dicts (displayed in a later cell).
c = Counter()
for place in extract_subtype('jobLocation', 'Place'):
    values = place.get('http://schema.org/address')
    if values and isinstance(values[0], dict):
        c.update(values[0].keys())
# Show a few JSON-LD jobLocation addresses that are plain strings.
i = 0
for place in extract_subtype('jobLocation', 'Place'):
    values = place.get('http://schema.org/address')
    if values and isinstance(values[0], str):
        print(values[0])
        i += 1
        if i > 10:
            break

c  # keys inside the JSON-LD address dicts (built in the previous cell)

# md

c = Counter()
for place in extract_subtype('jobLocation', 'Place', False):
    values = place.get('http://schema.org/Place/address')
    if values and isinstance(values[0], dict):
        c.update(values[0].keys())
c

# #### name

_RDF_KEY = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'


def _tally_types(places, field):
    """Counter of the rdf type (for dicts) or Python type of field's first value."""
    tally = Counter()
    for place in places:
        values = place.get(field)
        if not values:
            continue
        head = values[0]
        if isinstance(head, dict) and _RDF_KEY in head:
            tally.update([head[_RDF_KEY][0]])
        else:
            tally.update([type(head)])
    return tally


def _tally_keys(places, field):
    """Counter of the keys of field's first value, when it is a dict."""
    tally = Counter()
    for place in places:
        values = place.get(field)
        if values and isinstance(values[0], dict):
            tally.update(list(values[0]))
    return tally


c = _tally_types(extract_subtype('jobLocation', 'Place'), 'http://schema.org/name')
c

c = _tally_types(extract_subtype('jobLocation', 'Place', False), 'http://schema.org/Place/name')
c

# A few sample place names from each extraction.
i = 0
for place in extract_subtype('jobLocation', 'Place'):
    values = place.get('http://schema.org/name')
    if values:
        print(values[0])
        i += 1
        if i >= 5:
            break

i = 0
for place in extract_subtype('jobLocation', 'Place', False):
    values = place.get('http://schema.org/Place/name')
    if values:
        print(values[0])
        i += 1
        if i >= 5:
            break

# #### geo

c = _tally_types(extract_subtype('jobLocation', 'Place'), 'http://schema.org/geo')
c

c = _tally_types(extract_subtype('jobLocation', 'Place', False), 'http://schema.org/Place/geo')
c

c = _tally_keys(extract_subtype('jobLocation', 'Place'), 'http://schema.org/geo')
c

# Microdata geo values that came through as plain strings.
c = Counter()
for place in extract_subtype('jobLocation', 'Place', False):
    values = place.get('http://schema.org/Place/geo')
    if values and isinstance(values[0], str):
        print(values[0])

c = _tally_keys(extract_subtype('jobLocation', 'Place', False), 'http://schema.org/Place/geo')
c

# Python types of the latitude values (JSON-LD).
c = Counter()
for place in extract_subtype('jobLocation', 'Place'):
    values = place.get('http://schema.org/geo')
    if values and isinstance(values[0], dict):
        coords = values[0]
        if 'http://schema.org/latitude' in coords:
            c.update([type(coords['http://schema.org/latitude'][0])])
c

# NOTE(review): gate kept as in the original — checks *latitude* presence but
# tallies the *longitude* type; confirm that was intended.
c = Counter()
for place in extract_subtype('jobLocation', 'Place'):
    values = place.get('http://schema.org/geo')
    if values and isinstance(values[0], dict):
        coords = values[0]
        if 'http://schema.org/latitude' in coords:
            c.update([type(coords['http://schema.org/longitude'][0])])
c

# Python types of the latitude values (microdata).
c = Counter()
for place in extract_subtype('jobLocation', 'Place', False):
    values = place.get('http://schema.org/Place/geo')
    if values and isinstance(values[0], dict):
        coords = values[0]
        if 'http://schema.org/GeoCoordinates/latitude' in coords:
            c.update([type(coords['http://schema.org/GeoCoordinates/latitude'][0])])
c
# Print microdata (longitude, latitude) pairs.
c = Counter()
for place in extract_subtype('jobLocation', 'Place', False):
    values = place.get('http://schema.org/Place/geo')
    if values and isinstance(values[0], dict):
        coords = values[0]
        if 'http://schema.org/GeoCoordinates/longitude' in coords:
            print(coords['http://schema.org/GeoCoordinates/longitude'][0],
                  coords['http://schema.org/GeoCoordinates/latitude'][0])
c

# Print a few JSON-LD latitude/longitude value lists.
i = 0
for place in extract_subtype('jobLocation', 'Place'):
    values = place.get('http://schema.org/geo')
    if values and isinstance(values[0], dict):
        coords = values[0]
        if 'http://schema.org/latitude' in coords:
            i += 1
            print(coords['http://schema.org/latitude'], coords['http://schema.org/longitude'])
            if i > 10:
                break

# ### Postal Address

c = Counter()
for place in extract_subtype('jobLocation', 'Place'):
    values = place.get('http://schema.org/address')
    if values and isinstance(values[0], dict):
        c.update(values[0].keys())
c

c = Counter()
for place in extract_subtype('jobLocation', 'Place', False):
    values = place.get('http://schema.org/Place/address')
    if values and isinstance(values[0], dict):
        c.update(values[0].keys())
c

# #### addressCountry


def _collect_address_field(address_key, field, json=True):
    """First values of *field* inside the first *address_key* dict of each Place."""
    found = []
    for place in extract_subtype('jobLocation', 'Place', json):
        address = place.get(address_key)
        if address and isinstance(address[0], dict):
            inner = address[0].get(field)
            if inner:
                found.append(inner[0])
    return found


c = _collect_address_field('http://schema.org/address', 'http://schema.org/addressCountry')
c[:5]

Counter(map(type, c))

Counter(k for a in c for k in a if isinstance(a, dict))

# NOTE(review): kept as original — the inner loop over `a` repeats the type
# count once per key of each dict.
Counter(a['http://www.w3.org/1999/02/22-rdf-syntax-ns#type'][0] for a in c for k in a if isinstance(a, dict))

c = _collect_address_field('http://schema.org/Place/address',
                           'http://schema.org/PostalAddress/addressCountry', False)
c[:5]

Counter(map(type, c))

# Empty...
[a for a in c if isinstance(a, dict)]

# Country name

c = []
for place in extract_subtype('jobLocation', 'Place'):
    address = place.get('http://schema.org/address')
    if address and isinstance(address[0], dict):
        country = address[0].get('http://schema.org/addressCountry')
        if country and isinstance(country[0], dict):
            name = country[0].get('http://schema.org/name')
            if name:
                c.append(name[0])
c[:5], len(c), Counter(map(type, c))

# #### addressLocality

c = _collect_address_field('http://schema.org/address', 'http://schema.org/addressLocality')
c[:5], len(c), Counter(map(type, c))

c = _collect_address_field('http://schema.org/Place/address',
                           'http://schema.org/PostalAddress/addressLocality', False)
c[:5], len(c), Counter(map(type, c))

[a for a in c if isinstance(a, dict)]

# #### addressRegion

c = _collect_address_field('http://schema.org/address', 'http://schema.org/addressRegion')
c[:5], len(c), Counter(map(type, c))

# NOTE(review): queries addressLocality here despite the addressRegion heading —
# kept exactly as the original; looks like a copy-paste slip worth confirming.
c = _collect_address_field('http://schema.org/Place/address',
                           'http://schema.org/PostalAddress/addressLocality', False)
c[:5], len(c), Counter(map(type, c))

[a for a in c if isinstance(a, dict)]

# #### postalCode

c = _collect_address_field('http://schema.org/address', 'http://schema.org/postalCode')
c[:5], len(c), Counter(map(type, c))

c = _collect_address_field('http://schema.org/Place/address',
                           'http://schema.org/PostalAddress/postalCode', False)
c[:5], len(c), Counter(map(type, c))

# #### streetAddress

c = _collect_address_field('http://schema.org/address', 'http://schema.org/streetAddress')
c[:5], len(c), Counter(map(type, c))

c = _collect_address_field('http://schema.org/Place/address',
                           'http://schema.org/PostalAddress/streetAddress', False)
c[:5], len(c), Counter(map(type, c))

# # Base Salary

Counter(extract_types(json_graphs, 'baseSalary')), Counter(extract_types(graphs, 'JobPosting/baseSalary'))


def _collect_amount_field(field, json=True):
    """First values of *field* on each MonetaryAmount baseSalary dict."""
    found = []
    for amount in extract_subtype('baseSalary', 'MonetaryAmount', json):
        values = amount.get(field)
        if values:
            found.append(values[0])
    return found


c = _collect_amount_field('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
Counter(c)

c = _collect_amount_field('http://www.w3.org/1999/02/22-rdf-syntax-ns#type', False)
Counter(c)

c = [key for amount in extract_subtype('baseSalary', 'MonetaryAmount') for key in amount]
Counter(c)

c = [key for amount in extract_subtype('baseSalary', 'MonetaryAmount', False) for key in amount]
Counter(c)

# #### currency

c = _collect_amount_field('http://schema.org/currency')
c[:5], len(c), Counter(map(type, c))

c = _collect_amount_field('http://schema.org/MonetaryAmount/currency', False)
c[:5], len(c), Counter(map(type, c))
# #### value


def _amount_values(field, json=True):
    """First values of *field* on each MonetaryAmount baseSalary dict."""
    collected = []
    for amount in extract_subtype('baseSalary', 'MonetaryAmount', json):
        values = amount.get(field)
        if values:
            collected.append(values[0])
    return collected


c = _amount_values('http://schema.org/value')
[v for v in c if type(v) == str][:10], len(c), Counter(map(type, c))

# Shared shorthand for the rdf:type predicate key (used by later cells too).
rdftype = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'

Counter([v[rdftype][0] for v in c if isinstance(v, dict) and rdftype in v])

c = _amount_values('http://schema.org/MonetaryAmount/value', False)
[v for v in c if type(v) == str][:10], len(c), Counter(map(type, c))

Counter([v[rdftype][0] for v in c if isinstance(v, dict) and rdftype in v])

# ### Quantitative Value

# Keys found on values typed as QuantitativeValue.
c = []
for amount in extract_subtype('baseSalary', 'MonetaryAmount'):
    values = amount.get('http://schema.org/value')
    if values and isinstance(values[0], dict) and values[0].get(rdftype) == ['http://schema.org/QuantitativeValue']:
        c += values[0]  # extends with the dict's keys
Counter(c)

c = []
for amount in extract_subtype('baseSalary', 'MonetaryAmount', False):
    values = amount.get('http://schema.org/MonetaryAmount/value')
    if values and isinstance(values[0], dict) and values[0].get(rdftype) == ['http://schema.org/QuantitativeValue']:
        c += values[0]
Counter(c)


def _quantitative_values(amount_field, inner_field, json=True):
    """First *inner_field* values of value dicts typed as QuantitativeValue."""
    collected = []
    for amount in extract_subtype('baseSalary', 'MonetaryAmount', json):
        values = amount.get(amount_field)
        if values and isinstance(values[0], dict) and values[0].get(rdftype) == ['http://schema.org/QuantitativeValue']:
            inner = values[0].get(inner_field)
            if inner:
                collected.append(inner[0])
    return collected


# #### unitText

c = _quantitative_values('http://schema.org/value', 'http://schema.org/unitText')
c[:5], len(c), Counter(map(type, c))

sorted(Counter(c).items(), key=lambda kv: kv[1], reverse=True)[:10]

c = _quantitative_values('http://schema.org/MonetaryAmount/value',
                         'http://schema.org/QuantitativeValue/unitText', False)
c[:5], len(c), Counter(map(type, c))

# #### minValue

c = _quantitative_values('http://schema.org/value', 'http://schema.org/minValue')
c[:5], len(c), Counter(map(type, c))

c = _quantitative_values('http://schema.org/MonetaryAmount/value',
                         'http://schema.org/QuantitativeValue/minValue', False)
c[:5], len(c), Counter(map(type, c))

c = _quantitative_values('http://schema.org/value', 'http://schema.org/maxValue')
c[:5], len(c), Counter(map(type, c))

c = _quantitative_values('http://schema.org/MonetaryAmount/value',
                         'http://schema.org/QuantitativeValue/maxValue', False)
c[:5], len(c), Counter(map(type, c))

c = _quantitative_values('http://schema.org/value', 'http://schema.org/value')
c[:5], len(c), Counter(map(type, c))

c = _quantitative_values('http://schema.org/MonetaryAmount/value',
                         'http://schema.org/QuantitativeValue/value', False)
c[:5], len(c), Counter(map(type, c))

# ### Monetary Amount minvalue
# min/max directly on the MonetaryAmount (not nested in a QuantitativeValue).


def _salary_field(field, json=True):
    """First values of *field* on each MonetaryAmount baseSalary dict."""
    collected = []
    for amount in extract_subtype('baseSalary', 'MonetaryAmount', json):
        values = amount.get(field)
        if values:
            collected.append(values[0])
    return collected


c = _salary_field('http://schema.org/minValue')
c[:10], len(c), Counter(map(type, c))

c = _salary_field('http://schema.org/MonetaryAmount/minValue', False)
c[:10], len(c), Counter(map(type, c))

c = _salary_field('http://schema.org/maxValue')
c[:10], len(c), Counter(map(type, c))

c = _salary_field('http://schema.org/MonetaryAmount/maxValue', False)
c[:10], len(c), Counter(map(type, c))

# ## Date Posted

Counter(extract_types(json_graphs, 'datePosted')), Counter(extract_types(graphs, 'JobPosting/datePosted'))

list(extract_property(json_graphs, 'datePosted'))[:3]
list(extract_property(graphs, 'JobPosting/datePosted'))[:3]

# ### Hiring Organization

Counter(extract_types(json_graphs, 'hiringOrganization')), Counter(extract_types(graphs, 'JobPosting/hiringOrganization'))

list(extract_property(json_graphs, 'hiringOrganization'))[:3]
list(extract_property(graphs, 'JobPosting/hiringOrganization'))[:3]

# Keys used on hiringOrganization dicts typed as Organization.
c = [key for org in extract_subtype('hiringOrganization', 'Organization') for key in org]
Counter(c)

c = [key for org in extract_subtype('hiringOrganization', 'Organization', False) for key in org]
Counter(c)


def _org_field(field, json=True):
    """First values of *field* on each Organization hiringOrganization dict."""
    collected = []
    for org in extract_subtype('hiringOrganization', 'Organization', json):
        values = org.get(field)
        if values:
            collected.append(values[0])
    return collected


# #### name

c = _org_field('http://schema.org/name')
c[:10], len(c), Counter(map(type, c))

c = _org_field('http://schema.org/Organization/name', False)
c[:10], len(c), Counter(map(type, c))

# #### sameAs

c = _org_field('http://schema.org/sameAs')
c[:10], len(c), Counter(map(type, c))

c = _org_field('http://schema.org/Organization/sameAs', False)
c[:10], len(c), Counter(map(type, c))

# #### logo

c = _org_field('http://schema.org/logo')
c[:10], len(c), Counter(map(type, c))

Counter([a[rdftype][0] for a in c if isinstance(a, dict) and rdftype in a])
Counter([k for a in c if isinstance(a, dict) and rdftype in a for k in a])
[a for a in c if isinstance(a, dict) and rdftype in a][:10]

c = _org_field('http://schema.org/Organization/logo', False)
c[:10], len(c), Counter(map(type, c))

Counter([k for a in c if isinstance(a, dict) and rdftype in a for k in a])
[a for a in c if isinstance(a, dict) and rdftype in a][:10]

# #### url

c = _org_field('http://schema.org/url')
c[:10], len(c), Counter(map(type, c))

c = _org_field('http://schema.org/Organization/url', False)
c[:10], len(c), Counter(map(type, c))

# ## validThrough

Counter(extract_types(json_graphs, 'validThrough')), Counter(extract_types(graphs, 'JobPosting/validThrough'))

list(extract_property(json_graphs, 'validThrough'))[:3]
list(extract_property(graphs, 'JobPosting/validThrough'))[:3]

# ### url

Counter(extract_types(json_graphs, 'url')), Counter(extract_types(graphs, 'JobPosting/url'))

list(extract_property(json_graphs, 'url'))[:3]
list(extract_property(graphs, 'JobPosting/url'))[:3]
# ### industry


def _type_survey(prop):
    """Type counters for a property in the JSON-LD and microdata samples."""
    return (Counter(extract_types(json_graphs, prop)),
            Counter(extract_types(graphs, 'JobPosting/' + prop)))


def _share_table(data, prop, top, skip_dicts=True):
    """Top values of a property with their share of all occurrences."""
    values = (v for vs in extract_property(data, prop) for v in vs
              if not skip_dicts or type(v) != dict)
    return (pd.Series(values).value_counts().to_frame()
            .assign(pct=lambda df: df[0] / sum(df[0])).head(top).T)


_type_survey('industry')
_share_table(json_graphs, 'industry', 20, skip_dicts=False)
_share_table(graphs, 'JobPosting/industry', 20, skip_dicts=False)

# ### educationRequirements

_type_survey('educationRequirements')
_share_table(json_graphs, 'educationRequirements', 20)
_share_table(graphs, 'JobPosting/educationRequirements', 20)

# ### workHours

_type_survey('workHours')
list(extract_property(json_graphs, 'workHours'))[:10]
list(extract_property(graphs, 'JobPosting/workHours'))[:10]

# ### experienceRequirements

_type_survey('experienceRequirements')
_share_table(json_graphs, 'experienceRequirements', 6)
_share_table(graphs, 'JobPosting/experienceRequirements', 6)

# ### occupationalCategory

_type_survey('occupationalCategory')
_share_table(json_graphs, 'occupationalCategory', 20)
_share_table(graphs, 'JobPosting/occupationalCategory', 20)

# ### qualifications

_type_survey('qualifications')
_share_table(json_graphs, 'qualifications', 4)
_share_table(graphs, 'JobPosting/qualifications', 4)

# ### identifier

_type_survey('identifier')
list(extract_property(json_graphs, 'identifier'))[:3]
list(extract_property(graphs, 'JobPosting/identifier'))[:3]

# ### salaryCurrency

_type_survey('salaryCurrency')
_share_table(json_graphs, 'salaryCurrency', 10)
_share_table(graphs, 'JobPosting/salaryCurrency', 10)

# ### employmentType

_type_survey('employmentType')
_share_table(json_graphs, 'employmentType', 10)
_share_table(graphs, 'JobPosting/employmentType', 10)

# When it's multiple it's normally a listing

list(x for x in extract_property(json_graphs, 'employmentType') if len(x) > 1)[:20]

# ### jobBenefits

_type_survey('jobBenefits')
list(extract_property(json_graphs, 'jobBenefits'))[:10]
list(extract_property(graphs, 'JobPosting/jobBenefits'))[:3]

# ### Skills

_type_survey('skills')
list(extract_property(json_graphs, 'skills'))[:10]
list(extract_property(graphs, 'JobPosting/skills'))[:3]

# ### image

_type_survey('image')
list(extract_property(json_graphs, 'image'))[:3]
list(extract_property(graphs, 'JobPosting/image'))[:3]

# ### jobLocationType

_type_survey('jobLocationType')
_share_table(json_graphs, 'jobLocationType', 10)
_share_table(graphs, 'JobPosting/jobLocationType', 10)

# ### incentiveCompensation

_type_survey('incentiveCompensation')
list(extract_property(json_graphs, 'incentiveCompensation'))[:10]
list(extract_property(graphs, 'JobPosting/incentiveCompensation'))[:10]