import urllib
import urllib.request

import bs4 as bs
import numpy as np
import pandas as pd
Parse the past X years of 'medve' (bear) search results from szekelyhon.ro
# Search configuration: query the szekelyhon.ro archive for `keyword`
# between consecutive month-end dates from `start` to `end`.
keyword = 'medve'
baseurl = u'https://szekelyhon.ro/kereses?op=search&src_words='
start = '2002-12'
# Alternative shorter window kept for incremental re-runs:
#end='2018-11-01'
#start='2018-10'
end = '2018-12'
# Month-end timestamps covering the interval; keep only the
# 'YYYY-MM-DD' prefix of each timestamp's string form.
datelist = pd.date_range(start=pd.to_datetime(start), end=pd.to_datetime(end), freq='M').tolist()
dates = [str(date)[:10] for date in datelist]
dates[:5]
['2002-12-31', '2003-01-31', '2003-02-28', '2003-03-31', '2003-04-30']
def extractor(time1, time2):
    """Fetch one month window of search results and return its article divs.

    Parameters
    ----------
    time1, time2 : str
        Inclusive 'YYYY-MM-DD' bounds passed to the site's search form.

    Returns
    -------
    list of bs4 Tag objects with class 'cikkocka2c' (one per article hit).

    Relies on module-level `baseurl` and `keyword`.
    """
    # Bug fix: the original immediately overwrote time1/time2 from the
    # global loop index `i`, making the arguments dead. Use them as given.
    print('Parsing...', time1, '-', time2)
    url = baseurl + keyword + '&src_time1=' + time1 + '&src_time2=' + time2
    html = urllib.request.urlopen(url).read()
    soup = bs.BeautifulSoup(html, 'lxml')
    return soup.findAll("div", {"class": "cikkocka2c"})
# Download the result divs for every consecutive month window.
# NOTE: `i` must stay a module-level loop variable here.
divs = []
for i in range(len(dates) - 1):
    divs.append(extractor(dates[i], dates[i + 1]))
Parsing... 2002-12-31 - 2003-01-31 Parsing... 2003-01-31 - 2003-02-28 Parsing... 2003-02-28 - 2003-03-31 Parsing... 2003-03-31 - 2003-04-30 Parsing... 2003-04-30 - 2003-05-31 Parsing... 2003-05-31 - 2003-06-30 Parsing... 2003-06-30 - 2003-07-31 Parsing... 2003-07-31 - 2003-08-31 Parsing... 2003-08-31 - 2003-09-30 Parsing... 2003-09-30 - 2003-10-31 Parsing... 2003-10-31 - 2003-11-30 Parsing... 2003-11-30 - 2003-12-31 Parsing... 2003-12-31 - 2004-01-31 Parsing... 2004-01-31 - 2004-02-29 Parsing... 2004-02-29 - 2004-03-31 Parsing... 2004-03-31 - 2004-04-30 Parsing... 2004-04-30 - 2004-05-31 Parsing... 2004-05-31 - 2004-06-30 Parsing... 2004-06-30 - 2004-07-31 Parsing... 2004-07-31 - 2004-08-31 Parsing... 2004-08-31 - 2004-09-30 Parsing... 2004-09-30 - 2004-10-31 Parsing... 2004-10-31 - 2004-11-30 Parsing... 2004-11-30 - 2004-12-31 Parsing... 2004-12-31 - 2005-01-31 Parsing... 2005-01-31 - 2005-02-28 Parsing... 2005-02-28 - 2005-03-31 Parsing... 2005-03-31 - 2005-04-30 Parsing... 2005-04-30 - 2005-05-31 Parsing... 2005-05-31 - 2005-06-30 Parsing... 2005-06-30 - 2005-07-31 Parsing... 2005-07-31 - 2005-08-31 Parsing... 2005-08-31 - 2005-09-30 Parsing... 2005-09-30 - 2005-10-31 Parsing... 2005-10-31 - 2005-11-30 Parsing... 2005-11-30 - 2005-12-31 Parsing... 2005-12-31 - 2006-01-31 Parsing... 2006-01-31 - 2006-02-28 Parsing... 2006-02-28 - 2006-03-31 Parsing... 2006-03-31 - 2006-04-30 Parsing... 2006-04-30 - 2006-05-31 Parsing... 2006-05-31 - 2006-06-30 Parsing... 2006-06-30 - 2006-07-31 Parsing... 2006-07-31 - 2006-08-31 Parsing... 2006-08-31 - 2006-09-30 Parsing... 2006-09-30 - 2006-10-31 Parsing... 2006-10-31 - 2006-11-30 Parsing... 2006-11-30 - 2006-12-31 Parsing... 2006-12-31 - 2007-01-31 Parsing... 2007-01-31 - 2007-02-28 Parsing... 2007-02-28 - 2007-03-31 Parsing... 2007-03-31 - 2007-04-30 Parsing... 2007-04-30 - 2007-05-31 Parsing... 2007-05-31 - 2007-06-30 Parsing... 2007-06-30 - 2007-07-31 Parsing... 2007-07-31 - 2007-08-31 Parsing... 
2007-08-31 - 2007-09-30 Parsing... 2007-09-30 - 2007-10-31 Parsing... 2007-10-31 - 2007-11-30 Parsing... 2007-11-30 - 2007-12-31 Parsing... 2007-12-31 - 2008-01-31 Parsing... 2008-01-31 - 2008-02-29 Parsing... 2008-02-29 - 2008-03-31 Parsing... 2008-03-31 - 2008-04-30 Parsing... 2008-04-30 - 2008-05-31 Parsing... 2008-05-31 - 2008-06-30 Parsing... 2008-06-30 - 2008-07-31 Parsing... 2008-07-31 - 2008-08-31 Parsing... 2008-08-31 - 2008-09-30 Parsing... 2008-09-30 - 2008-10-31 Parsing... 2008-10-31 - 2008-11-30 Parsing... 2008-11-30 - 2008-12-31 Parsing... 2008-12-31 - 2009-01-31 Parsing... 2009-01-31 - 2009-02-28 Parsing... 2009-02-28 - 2009-03-31 Parsing... 2009-03-31 - 2009-04-30 Parsing... 2009-04-30 - 2009-05-31 Parsing... 2009-05-31 - 2009-06-30 Parsing... 2009-06-30 - 2009-07-31 Parsing... 2009-07-31 - 2009-08-31 Parsing... 2009-08-31 - 2009-09-30 Parsing... 2009-09-30 - 2009-10-31 Parsing... 2009-10-31 - 2009-11-30 Parsing... 2009-11-30 - 2009-12-31 Parsing... 2009-12-31 - 2010-01-31 Parsing... 2010-01-31 - 2010-02-28 Parsing... 2010-02-28 - 2010-03-31 Parsing... 2010-03-31 - 2010-04-30 Parsing... 2010-04-30 - 2010-05-31 Parsing... 2010-05-31 - 2010-06-30 Parsing... 2010-06-30 - 2010-07-31 Parsing... 2010-07-31 - 2010-08-31 Parsing... 2010-08-31 - 2010-09-30 Parsing... 2010-09-30 - 2010-10-31 Parsing... 2010-10-31 - 2010-11-30 Parsing... 2010-11-30 - 2010-12-31 Parsing... 2010-12-31 - 2011-01-31 Parsing... 2011-01-31 - 2011-02-28 Parsing... 2011-02-28 - 2011-03-31 Parsing... 2011-03-31 - 2011-04-30 Parsing... 2011-04-30 - 2011-05-31 Parsing... 2011-05-31 - 2011-06-30 Parsing... 2011-06-30 - 2011-07-31 Parsing... 2011-07-31 - 2011-08-31 Parsing... 2011-08-31 - 2011-09-30 Parsing... 2011-09-30 - 2011-10-31 Parsing... 2011-10-31 - 2011-11-30 Parsing... 2011-11-30 - 2011-12-31 Parsing... 2011-12-31 - 2012-01-31 Parsing... 2012-01-31 - 2012-02-29 Parsing... 2012-02-29 - 2012-03-31 Parsing... 2012-03-31 - 2012-04-30 Parsing... 2012-04-30 - 2012-05-31 Parsing... 
2012-05-31 - 2012-06-30 Parsing... 2012-06-30 - 2012-07-31 Parsing... 2012-07-31 - 2012-08-31 Parsing... 2012-08-31 - 2012-09-30 Parsing... 2012-09-30 - 2012-10-31 Parsing... 2012-10-31 - 2012-11-30 Parsing... 2012-11-30 - 2012-12-31 Parsing... 2012-12-31 - 2013-01-31 Parsing... 2013-01-31 - 2013-02-28 Parsing... 2013-02-28 - 2013-03-31 Parsing... 2013-03-31 - 2013-04-30 Parsing... 2013-04-30 - 2013-05-31 Parsing... 2013-05-31 - 2013-06-30 Parsing... 2013-06-30 - 2013-07-31 Parsing... 2013-07-31 - 2013-08-31 Parsing... 2013-08-31 - 2013-09-30 Parsing... 2013-09-30 - 2013-10-31 Parsing... 2013-10-31 - 2013-11-30 Parsing... 2013-11-30 - 2013-12-31 Parsing... 2013-12-31 - 2014-01-31 Parsing... 2014-01-31 - 2014-02-28 Parsing... 2014-02-28 - 2014-03-31 Parsing... 2014-03-31 - 2014-04-30 Parsing... 2014-04-30 - 2014-05-31 Parsing... 2014-05-31 - 2014-06-30 Parsing... 2014-06-30 - 2014-07-31 Parsing... 2014-07-31 - 2014-08-31 Parsing... 2014-08-31 - 2014-09-30 Parsing... 2014-09-30 - 2014-10-31 Parsing... 2014-10-31 - 2014-11-30 Parsing... 2014-11-30 - 2014-12-31 Parsing... 2014-12-31 - 2015-01-31 Parsing... 2015-01-31 - 2015-02-28 Parsing... 2015-02-28 - 2015-03-31 Parsing... 2015-03-31 - 2015-04-30 Parsing... 2015-04-30 - 2015-05-31 Parsing... 2015-05-31 - 2015-06-30 Parsing... 2015-06-30 - 2015-07-31 Parsing... 2015-07-31 - 2015-08-31 Parsing... 2015-08-31 - 2015-09-30 Parsing... 2015-09-30 - 2015-10-31 Parsing... 2015-10-31 - 2015-11-30 Parsing... 2015-11-30 - 2015-12-31 Parsing... 2015-12-31 - 2016-01-31 Parsing... 2016-01-31 - 2016-02-29 Parsing... 2016-02-29 - 2016-03-31 Parsing... 2016-03-31 - 2016-04-30 Parsing... 2016-04-30 - 2016-05-31 Parsing... 2016-05-31 - 2016-06-30 Parsing... 2016-06-30 - 2016-07-31 Parsing... 2016-07-31 - 2016-08-31 Parsing... 2016-08-31 - 2016-09-30 Parsing... 2016-09-30 - 2016-10-31 Parsing... 2016-10-31 - 2016-11-30 Parsing... 2016-11-30 - 2016-12-31 Parsing... 2016-12-31 - 2017-01-31 Parsing... 2017-01-31 - 2017-02-28 Parsing... 
2017-02-28 - 2017-03-31 Parsing... 2017-03-31 - 2017-04-30 Parsing... 2017-04-30 - 2017-05-31 Parsing... 2017-05-31 - 2017-06-30 Parsing... 2017-06-30 - 2017-07-31 Parsing... 2017-07-31 - 2017-08-31 Parsing... 2017-08-31 - 2017-09-30 Parsing... 2017-09-30 - 2017-10-31 Parsing... 2017-10-31 - 2017-11-30 Parsing... 2017-11-30 - 2017-12-31 Parsing... 2017-12-31 - 2018-01-31 Parsing... 2018-01-31 - 2018-02-28 Parsing... 2018-02-28 - 2018-03-31 Parsing... 2018-03-31 - 2018-04-30 Parsing... 2018-04-30 - 2018-05-31 Parsing... 2018-05-31 - 2018-06-30 Parsing... 2018-06-30 - 2018-07-31 Parsing... 2018-07-31 - 2018-08-31 Parsing... 2018-08-31 - 2018-09-30 Parsing... 2018-09-30 - 2018-10-31 Parsing... 2018-10-31 - 2018-11-30
def date_hu_en(i):
    """Convert a Hungarian date string like '2018. november 12.' to '2018-11-12'.

    Expects the fixed layout 'YYYY. <month name> DD.' so that the month
    name occupies i[6:-4], the year i[:4] and the day i[-3:-1]. If the
    month name is not recognized, the raw month slice is returned
    unchanged (same fallback as the original elif chain).
    """
    # Hungarian month name -> zero-padded month number.
    months = {
        u'január': '01', u'február': '02', u'március': '03', u'április': '04',
        u'május': '05', u'június': '06', u'július': '07', u'augusztus': '08',
        u'szeptember': '09', u'október': '10', u'november': '11', u'december': '12',
    }
    date = i[6:-4]
    if date not in months:
        return date
    return i[:4] + '-' + months[date] + '-' + i[-3:-1]
def find_all(s, ch):
    """Return every index at which character *ch* occurs in string *s*."""
    positions = []
    for idx, current in enumerate(s):
        if current == ch:
            positions.append(idx)
    return positions
Relevant = Medves cikk-e vagy sem: 1-igen, 0-nem biztos, -1: biztosan nem (is the article bear-related: 1 = yes, 0 = unsure, -1 = definitely not)
Deaths = Halalok szama, ha ismert (number of deaths, if known)
Severity = Sulyossag: 0-mas jellegu hir, 1-latas, 2-allat-tamadas, 3-ember-tamadas (severity: 0 = other news, 1 = sighting, 2 = attack on animals, 3 = attack on humans)
Duplicate = 0: Eredeti cikk, 1: Masolat, 2: Osszegzes (0 = original article, 1 = copy, 2 = summary)
def text_processor(title, content):
    """Heuristically classify an article by keyword matching.

    Returns a tuple (relevant, severity, deaths):
      relevant: 1 = bear-related, 0 = unsure, -1 = definitely not
      severity: 0 = other news, 2 = animal attack, 3 = human attack/death
      deaths:   always 0 here; filled in later during manual curation.

    Later rules override earlier ones, so a blacklist hit forces
    relevant = -1 even if attack keywords matched.
    """
    relevant = 0
    severity = 0
    deaths = 0
    text = title + content
    # Attack-related words -> relevant, severity 2 (attack on animals).
    if any(word in text for word in (u'támad', u'sebes')):
        relevant = 1
        severity = 2
    # Death-related words override with severity 3 (attack on humans).
    if any(word in text for word in (u'halál', u'áldozat', u'ölt ', u'pusztít')):
        relevant = 1
        severity = 3
    # 'medve'/'medvé' in the normalized title marks the article relevant.
    norm_title = title.replace(',', ' ').replace('.', ' ').lower()
    if any(word in norm_title for word in (u'medve', u'medvé')):
        relevant = 1
    # Blacklist: topics that contain 'medve' but are not about live bears
    # (sports clubs, wild garlic, Berlin film prize, zoos, Russia, ...).
    blacklist = (u'medvegyev', u'jegesmedvék', u'medvehagyma', u'aranymedve',
                 u'szibéria', u' kupa', u'jégkorong', u'kosárlabda',
                 u'labdarúgás', u'labdarúgó', u'steaua', u'c osztály',
                 u'berlin', u'állatkert', u'medve-tó', u'oroszorsz', u' orosz ')
    norm_text = text.replace(',', ' ').replace('.', ' ').lower()
    if any(word in norm_text for word in blacklist):
        relevant = -1
    return relevant, severity, deaths
# Walk every downloaded month of result divs and build `hirek` (one dict
# per article) plus `tagset` (the union of all tags ever seen).
hirek=[]
tagset=set()
for i in range(len(dates)-1):
    time2=dates[i+1]
    divgroup=divs[i]
    for div in divgroup:
        icat=''
        img=div.find('img')
        if img !=None:
            img=img['src']
            #infer image category from image link
            icats=find_all(img,'/')
            if len(icats)>4:
                icat=img[icats[3]+1:icats[4]]
        # Tag list of the article, if a tag container is present.
        tags=div.find("div", {"class": "tags_con1"})
        if tags!=None:
            tags=[j.text.strip() for j in tags.findAll('div')]
        # The inner div of the 'catinner' box carries the date text.
        idiv=div.find("div", {"class": "catinner"})
        if idiv!=None:
            idiv=idiv.find('div')
        content=div.find('p')
        # Hungarian date string, e.g. '2018. november 12.' (year..comma).
        date=idiv.text[idiv.text.find('20'):idiv.text.find(',')]
        title=div.find('h2').text
        if content==None:
            # No <p> lead paragraph: fall back to the raw HTML between the
            # last </a> and the end of the div. The string is reversed so
            # the *last* '</a>' can be found with a forward find.
            sdiv=str(div)[::-1]
            content=sdiv[:sdiv.find('>a/<')].replace('\r','').replace('\t','').replace('\n','')[::-1][:-6]
        else: content=content.text
        # Strip layout markup that leaks into the text.
        content=content.replace('</div><div class="clear"></div></div><div class="clear"></div>','')
        link=div.findAll('a')[-1]['href']
        #infer category from link
        cats=find_all(link,'/')
        if len(cats)>3:
            cat=link[cats[2]+1:cats[3]]
        else: cat=''
        #infer attack from plain text
        relevant,severity,deaths=text_processor(title,content)
        # Tags that rule the article out regardless of keyword hits.
        if tags!=None:
            notags=[u'Húsvét',u'Film',u'Egészségügy',u'Külföld',u'Színház',u'Ünnep']
            for notag in notags:
                if notag in tags:
                    relevant=-1
                    break
        # Keep anything not ruled out, except sport/culture/opinion
        # categories and the generic 'Röviden' (news-in-brief) title.
        if ((relevant>-1)&\
            (cat not in ['sport','muvelodes','sms-e-mail-velemeny','tusvanyos'])&\
            (title not in [u'Röviden'])):
            if tags!=None:
                tagset=tagset.union(set(tags))
                # An explicit 'medve' tag is authoritative.
                if 'medve' in tags:
                    relevant=1
            hirek.append({'date':date_hu_en(date),
                          'hudate':date,
                          'title':title,
                          'image':img,
                          'tags':repr(tags),
                          'content':content,
                          'link':link,
                          'category':cat,
                          'icategory':icat,
                          'relevant':relevant,
                          'severity':severity,
                          'deaths':deaths,
                          'duplicate':0
                          })
Összes medvés hír
# Collect all scraped articles into a DataFrame, sorted oldest first,
# with exact duplicate rows dropped.
# Fix: pd.DataFrame().from_dict(...) built a throwaway frame just to call
# the classmethod (whose documented input is a dict, not a list of dicts);
# the constructor handles a list of dicts directly.
df = pd.DataFrame(hirek)
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date').drop_duplicates().reset_index(drop=True)
len(hirek)
1168
Save to medve Excel. Manual curation
# Inspect the resulting column set before splitting the frame.
df.columns
Index(['category', 'content', 'date', 'deaths', 'duplicate', 'hudate', 'icategory', 'image', 'link', 'relevant', 'severity', 'tags', 'title'], dtype='object')
# dm: parsed article metadata; dc: the columns subject to manual curation.
meta_cols = ['date', 'hudate', 'link', 'image', 'category', 'icategory',
             'tags', 'title', 'content']
dm = df[meta_cols]
dc = df[['title', 'content', 'relevant', 'severity', 'deaths', 'duplicate']]
# Persist the parsed data.
dm.to_excel('szekelyhon_medve.xlsx')
# Persist the curation sheet. Flip existing_savedata to True once a
# manually edited sheet exists, so it is merged rather than overwritten.
existing_savedata = False
if existing_savedata:
    # Merge: values already present in the saved sheet take precedence.
    dc2 = pd.read_excel('szekelyhon_medve_curated.xlsx')
    dc2.combine_first(dc).to_excel('szekelyhon_medve_curated.xlsx')
else:
    dc.to_excel('szekelyhon_medve_curated.xlsx')
# Pull manually curated labels back in: for every row of dc whose content
# text appears in the curated sheet, take the curated values; otherwise
# keep the automatically inferred ones.
dr = pd.read_excel('data/szekelyhon_medve_curated_manual.xlsx')
dr = dr[['content', 'relevant', 'deaths', 'severity', 'duplicate']].set_index('content')
# Work on a copy: dc is a slice of df, so assigning columns to it raised
# SettingWithCopyWarning and risked not writing through.
dc = dc.copy()
relevant = []
deaths = []
severity = []
duplicate = []
for i in range(len(dc.index)):
    row = dc.loc[i]
    # Empty content cannot be matched reliably, so such rows always keep
    # the automatic values.
    if row['content'] != '' and row['content'] in dr.index:
        dk = dr.loc[row['content']]
    else:
        dk = row
    # dr.loc[...] returns a DataFrame when the content text is duplicated
    # in the curated sheet; flatten and take the first value either way.
    relevant.append(np.array(dk['relevant']).flatten()[0])
    deaths.append(np.array(dk['deaths']).flatten()[0])
    severity.append(np.array(dk['severity']).flatten()[0])
    duplicate.append(np.array(dk['duplicate']).flatten()[0])
dc['relevant'] = relevant
dc['deaths'] = deaths
dc['severity'] = severity
dc['duplicate'] = duplicate
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy """Entry point for launching an IPython kernel. C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy This is separate from the ipykernel package so we can avoid doing imports until C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy after removing the cwd from sys.path.
# Persist the merged curation back to the working sheet.
dc.to_excel('szekelyhon_medve_curated.xlsx')