모범답안 1¶

In [ ]:

import requests
import re
import string
import html
def countWords(url, source=None):
    """Count how often each word occurs in the visible text of a web page.

    HTML comments, ``<script>``/``<style>`` blocks and all remaining tags are
    removed, entity references are decoded, ASCII punctuation is deleted and
    the remaining text is split on whitespace.

    Args:
        url: Address of the page, downloaded with ``requests.get``.
        source: Optional HTML text. When it is given, that text is analysed
            and ``url`` is not downloaded.

    Returns:
        A dict mapping each word to its number of occurrences, ordered by
        first appearance.
    """
    from collections import Counter

    if source is None:
        # A timeout keeps an unresponsive server from blocking the run forever.
        source = requests.get(url, timeout=10).text

    # Remove content that is never displayed; a single alternation lets the
    # leftmost construct win when comments and scripts are nested.
    hidden = re.compile(r'<!--.*?-->|<(script|style)\b.*?</\1\s*>', re.I | re.S)
    visible = hidden.sub(' ', source)
    # Every remaining tag becomes a space so neighbouring words stay apart.
    visible = re.sub(r'<[^>]*>', ' ', visible)
    # Decode entities only after the markup is gone, so "&lt;" cannot be
    # mistaken for the start of a tag.
    text = html.unescape(visible)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Count whole words of the extracted text instead of substring hits in
    # the raw HTML.
    return dict(Counter(text.split()))
# Pages to analyse; one word-frequency dictionary is printed per page.
urls = [
    'http://cse.koreatech.ac.kr',
    'https://www.koreatech.ac.kr',
    'http://www.naver.com',
    'http://www.daum.net',
    'http://www.nytimes.com',
]
for u in urls:
    # The doubled newline leaves an empty line after every dictionary.
    print(countWords(u), end='\n\n')

모범답안 2¶

In [1]:

def extractText(source):
    """Return the plain text of an HTML string, leaving out the markup.

    Comments, ``<script>`` blocks and ``<style>`` blocks are skipped together
    with their content; every other ``<...>`` tag is dropped and the text
    between tags is kept. Tabs and newlines become spaces so that words on
    adjacent lines stay separate, and the text following an ordinary tag is
    always terminated by a space so the result can be split on whitespace.

    Args:
        source: HTML text.

    Returns:
        The extracted text. An unterminated tag, comment, script or style
        block swallows the rest of the input.
    """
    # Regions skipped as a whole: (opening marker, closing marker).
    skipped = (("<!--", "-->"), ("<script", "</script>"), ("<style", "</style>"))
    pieces = []              # text fragments, joined once at the end
    ends_with_space = False  # whether the text gathered so far ends with " "
    after_tag = False        # whether the markup just passed was an ordinary tag
    pos = 0                  # index of the first character not yet processed

    while True:
        lt = source.find('<', pos)
        raw = source[pos:] if lt == -1 else source[pos:lt]
        chunk = raw.replace('\t', ' ').replace('\n', ' ')
        if chunk:
            pieces.append(chunk)
            ends_with_space = chunk.endswith(' ')
        # Separate this run of text from whatever follows the next tag.
        if (chunk or after_tag) and not ends_with_space:
            pieces.append(' ')
            ends_with_space = True
        if lt == -1:  # no markup left
            break

        for opener, closer in skipped:
            if source.startswith(opener, lt):
                end = source.find(closer, lt)
                # Use the closer's real length; a missing closer ends the scan.
                pos = len(source) if end == -1 else end + len(closer)
                after_tag = False
                break
        else:
            gt = source.find('>', lt)
            # A tag that is never closed ends the scan instead of looping.
            pos = len(source) if gt == -1 else gt + 1
            after_tag = True

    return ''.join(pieces)
import string

def replaceAll(text, dic):
    """Replace every key of ``dic`` found in ``text`` with its mapped value.

    The substitutions are applied one after another in the dictionary's
    iteration order, so a later entry also sees the output of earlier ones.

    Args:
        text: The string to rewrite.
        dic: Mapping from substring to its replacement.

    Returns:
        The rewritten string.
    """
    result = text
    for old, new in dic.items():
        # str.replace substitutes every occurrence when no count is given.
        result = result.replace(old, new)
    return result

def stripSpecialChar(text):
    """Remove punctuation and decorative symbols from ``text``.

    HTML character references such as ``&amp;`` or ``&nbsp;`` are decoded
    first, and the characters they stand for are then handled like any other
    character. Deleting the bare entity names instead would damage ordinary
    words that merely contain them ("salt", "example", "camp").

    Args:
        text: Text that may still contain HTML character references.

    Returns:
        The text without ASCII punctuation, without the symbols ★ ☆ ※ and
        without the middle dot and arrow characters produced by ``&middot;``,
        ``&uarr;``, ``&rarr;``, ``&darr;`` and ``&harr;``. A non-breaking
        space is turned into a regular space.
    """
    import html

    decoded = html.unescape(text)
    removable = string.punctuation + "★☆※" + "·↑→↓↔"
    # "&nbsp;" decodes to U+00A0; map it to a plain space so it keeps
    # separating the words around it.
    return decoded.translate(str.maketrans("\xa0", " ", removable))
def wordOfFreq(l):
    """Return a dictionary mapping each word in ``l`` to its frequency.

    Args:
        l: List of words; duplicates are allowed.

    Returns:
        A dict whose keys are the distinct words, in order of first
        appearance, and whose values are their numbers of occurrences.
    """
    from collections import Counter

    # Counter pairs every word with its own count in a single pass.
    return dict(Counter(l))
import requests

# Download the page and keep its HTML.
req = requests.get('http://cse.koreatech.ac.kr')
source = req.text

# Extract the plain text, then remove the special characters from it.
text = stripSpecialChar(extractText(source))

# Split on whitespace and print how often each word occurs.
l = text.split()
print(wordOfFreq(l))

모범답안 3¶

In [ ]:

import requests
import re
import string  

def call(b, source=None):
    """Print a dictionary of word frequencies for the visible text of a page.

    HTML comments, ``<script>``/``<style>`` blocks and all remaining tags are
    removed, entity references are decoded, every ASCII punctuation character
    is replaced by a space and the result is split on whitespace.

    Args:
        b: Address of the page, downloaded with ``requests.get``.
        source: Optional HTML text. When it is given, that text is analysed
            and ``b`` is not downloaded.

    Returns:
        None. The dictionary (word -> count, ordered by first appearance) is
        written to standard output.
    """
    import html
    from collections import Counter

    if source is None:
        # A timeout keeps an unresponsive server from blocking the run forever.
        source = requests.get(b, timeout=10).text

    # Comments go together with script/style blocks so that markup which has
    # been commented out does not contribute words.
    hidden = r'<!--.*?-->|<script\b.*?</script\s*>|<style\b.*?</style\s*>'
    text = re.sub(hidden, ' ', source, flags=re.I | re.S)
    text = re.sub(r'<[^>]+>', ' ', text)
    # Decode entities after the tags are gone; otherwise "&nbsp;" would be
    # counted as the word "nbsp".
    text = html.unescape(text)
    # Same character set as the original hand-written class, i.e. exactly
    # string.punctuation, each character mapped to a space.
    blanks = ' ' * len(string.punctuation)
    text = text.translate(str.maketrans(string.punctuation, blanks))
    print(dict(Counter(text.split())))

# Print each site's address followed by its word-frequency dictionary,
# with one empty line between consecutive reports.
target_urls = (
    'http://cse.koreatech.ac.kr',
    'https://www.koreatech.ac.kr',
    'http://www.naver.com',
    'http://www.daum.net',
    'http://www.nytimes.com',
)
for position, site in enumerate(target_urls):
    if position:
        # Separator before every report except the first one.
        print()
    print(site)
    call(site)