#!/usr/bin/env python
# coding: utf-8

# In[1]:


# !pip install bs4


# In[2]:


import bs4 as bs
import urllib.request
import pandas as pd
import numpy as np


# Parse past X years

# In[3]:


keyword = 'medve'
baseurl = u'https://szekelyhon.ro/kereses?op=search&src_words='
# Modified to not include dates, to comply with the site's new article format


# In[4]:


def extractor(page):
    """Download one page of search results and return its article divs."""
    print('Parsing... page', page)
    url = baseurl + keyword + '&page=' + str(page)
    html = urllib.request.urlopen(url).read()
    # soup = bs.BeautifulSoup(html, 'lxml')
    soup = bs.BeautifulSoup(html, 'html.parser')
    return soup.findAll('div', {'class': 'cikkocka2c'})


# In[5]:


# fetch the first 12 result pages
divs = []
for i in range(1, 13):
    divs.append(extractor(i))


# In[6]:


def date_hu_en(i):
    """Convert a Hungarian date such as '2019. augusztus 12.' to ISO '2019-08-12'."""
    months = {
        u'január': '01', u'február': '02', u'március': '03', u'április': '04',
        u'május': '05', u'június': '06', u'július': '07', u'augusztus': '08',
        u'szeptember': '09', u'október': '10', u'november': '11', u'december': '12',
    }
    month = i[6:-4]
    if month not in months:
        return month
    return i[:4] + '-' + months[month] + '-' + i[-3:-1]


# In[7]:


def find_all(s, ch):
    """Return every index at which character ch occurs in string s."""
    return [i for i, letter in enumerate(s) if letter == ch]


# In[8]:


from utils import text_processor
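# In[ ]:


# `text_processor` comes from the local `utils` module, which is not shown in
# this notebook; it returns a (relevant, severity, deaths) triple for an
# article. The function below is only a hypothetical sketch of that contract,
# using crude keyword heuristics, and is deliberately named differently so it
# does not shadow the real import.
def _text_processor_sketch(title, content):
    text = (title + ' ' + content).lower()
    relevant = 1 if u'medve' in text else 0    # is a bear mentioned at all?
    severity = 4 if u'támad' in text else 0    # crude proxy for an attack report
    deaths = 0                                 # a real version would parse counts
    return relevant, severity, deaths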
# In[9]:


hirek = []
tagset = set()
for divgroup in divs:
    for div in divgroup:
        icat = ''
        img = div.find('img')
        if img is not None:
            img = img['src']
            # infer the image category from the image link
            icats = find_all(img, '/')
            if len(icats) > 4:
                icat = img[icats[3] + 1:icats[4]]
        tags = div.find('div', {'class': 'tags_con1'})
        if tags is not None:
            tags = [j.text.strip() for j in tags.findAll('div')]
        idiv = div.find('div', {'class': 'catinner'})
        if idiv is not None:
            idiv = idiv.find('div')
        content = div.find('p')
        date = idiv.text[idiv.text.find('20'):idiv.text.find(',')]
        title = div.find('h2').text
        print(title)
        if content is None:
            # no <p> tag: take the raw HTML before the last closing anchor tag
            sdiv = str(div)[::-1]
            content = sdiv[:sdiv.find('>a/<')].replace('\r', '').replace('\t', '').replace('\n', '')[::-1][:-6]
        else:
            content = content.text
        content = content.replace('\n', '')
        link = div.findAll('a')[-1]['href']
        # infer the article category from the link
        cats = find_all(link, '/')
        if len(cats) > 3:
            cat = link[cats[2] + 1:cats[3]]
        else:
            cat = ''
        # infer attack details from the plain text
        relevant, severity, deaths = text_processor(title, content)
        # articles carrying these tags (Easter, Film, Healthcare, Abroad,
        # Theatre, Holiday) are never relevant
        if tags is not None:
            notags = [u'Húsvét', u'Film', u'Egészségügy', u'Külföld', u'Színház', u'Ünnep']
            for notag in notags:
                if notag in tags:
                    relevant = -1
                    break
        if (relevant > -1 and
                cat not in ['sport', 'muvelodes', 'sms-e-mail-velemeny', 'tusvanyos'] and
                title not in [u'Röviden']):
            if tags is not None:
                tagset = tagset.union(set(tags))
                if 'medve' in tags:
                    relevant = 1
            hirek.append({'date': date_hu_en(date),
                          'hudate': date,
                          'title': title,
                          'image': img,
                          'tags': repr(tags),
                          'content': content,
                          'link': link,
                          'category': cat,
                          'icategory': icat,
                          'relevant': relevant,
                          'severity': severity,
                          'deaths': deaths,
                          'duplicate': 0})


# All bear-related news

# In[10]:


df = pd.DataFrame(hirek)
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date').drop_duplicates().reset_index(drop=True)


# In[11]:


len(hirek)


# Save to the bear Excel file, then curate manually

# In[12]:


dm = df[['date', 'hudate', 'link', 'image', 'category', 'icategory', 'tags', 'title', 'content']]
dc = df[['title', 'content', 'relevant', 'severity', 'deaths', 'duplicate']]


# In[13]:


# !pip install openpyxl


# In[14]:


# save the parsed data
dm.to_excel('data/szekelyhon_medve.xlsx')


# In[15]:


# save the data for curation
existing_savedata = False
if not existing_savedata:
    # 1) no saved curation data yet: write a fresh file
    dc.to_excel('data/szekelyhon_medve_curated.xlsx')
else:
    # 2) saved curation data exists: keep the curated values, append the new rows
    dc2 = pd.read_excel('data/szekelyhon_medve_curated.xlsx')
    dc2.combine_first(dc).to_excel('data/szekelyhon_medve_curated.xlsx')


# Open `data/szekelyhon_medve_curated.xlsx` and manually confirm each case.
# 
# Relevant = is the article about bears: 1 = yes, 0 = unsure, -1 = definitely not
# Deaths = number of deaths (if known)
# Severity = 0 = news of another kind, 1 = tracks, 2 = sighting, 3 = attack on an animal, 4 = attack on a human
# Duplicate = 0 = original article, 1 = copy, 2 = summary
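# In[16]:


# After manual curation, the confirmed cases can be read back and filtered.
# A minimal sketch, assuming the curated file keeps the column semantics
# documented above (`confirmed` is a hypothetical name):
curated = pd.read_excel('data/szekelyhon_medve_curated.xlsx')
confirmed = curated[(curated['relevant'] == 1) & (curated['duplicate'] == 0)]
print(len(confirmed), 'confirmed bear articles')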