"""Fake-news classification on Brazilian Portuguese news.

Pipeline:
  1. Download 3600 fake and 3600 true articles from the Fake.br corpus
     (one HTTP GET per article).
  2. Merge with the labelled SIRENE dataset (CSV, ';'-separated).
  3. Vectorize with TF-IDF (Portuguese stop words removed).
  4. Train and score PassiveAggressive, RandomForest and LightGBM models.
  5. Tune the LightGBM hyper-parameters with scikit-optimize, first by
     random search (dummy_minimize), then Bayesian (gp_minimize).

Requires: pandas, nltk, scikit-learn, lightgbm, urllib3, scikit-optimize.
NOTE(review): this was originally a notebook; the cell magic
``!pip install scikit-optimize`` is not valid Python and was removed —
install scikit-optimize before running.
"""

import nltk
import pandas as pd
import urllib3
from lightgbm import LGBMClassifier
from nltk.corpus import stopwords
from skopt import dummy_minimize, gp_minimize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

nltk.download('stopwords')

# One article per file, numbered 1..3600 in each sub-corpus.
FAKE_URL = "https://raw.githubusercontent.com/roneysco/Fake.br-Corpus/master/full_texts/fake/%d.txt"
TRUE_URL = "https://raw.githubusercontent.com/roneysco/Fake.br-Corpus/master/full_texts/true/%d.txt"
SIRENE_URL = 'https://raw.githubusercontent.com/ViniciusNunes0/SIRENE-news/master/noticias-sirene.csv'

http = urllib3.PoolManager()  # PoolManager
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # silence TLS warnings


def fetch_corpus(url_template, label, count=3600):
    """Download `count` articles from `url_template % i` (i = 1..count).

    Returns a DataFrame with columns ['noticia', 'label'], every row
    tagged with the given `label` ('fake' or 'true').

    Rows are accumulated in a list and built into a single DataFrame at
    the end: the original used `df.append` inside the loop, which is
    O(n^2) and was removed in pandas 2.0.
    """
    rows = []
    for i in range(1, count + 1):
        r = http.request('GET', url_template % i)  # fetch one article
        rows.append((r.data.decode('utf-8'), label))
    return pd.DataFrame(rows, columns=['noticia', 'label'])


def load_sirene():
    """Load the SIRENE dataset and normalise it to ['noticia', 'label'].

    The numeric `classificacao` column is mapped 0 -> 'true', 1 -> 'fake'
    to match the Fake.br labels.
    """
    df2 = pd.read_csv(SIRENE_URL, sep=';')
    df2 = df2[['noticia', 'classificacao']].rename(columns={'classificacao': 'label'})
    df2['label'] = df2['label'].replace({0: 'true', 1: 'fake'})
    return df2


def report_accuracy(model, tfidf_train, y_train, tfidf_test, y_test):
    """Fit `model`, print its test accuracy, and return the fitted model."""
    model.fit(tfidf_train, y_train)
    score = accuracy_score(y_test, model.predict(tfidf_test))
    print(f'Accuracy: {round(score*100,2)}%')
    return model


def main():
    # --- assemble the combined dataset --------------------------------
    df = pd.concat(
        [fetch_corpus(FAKE_URL, 'fake'),
         fetch_corpus(TRUE_URL, 'true'),
         load_sirene()],
        ignore_index=True,
    )
    # print() is needed outside a notebook; bare df.head() displays nothing.
    print(df.head())
    print(df.tail())

    # --- TF-IDF features ----------------------------------------------
    x_train, x_test, y_train, y_test = train_test_split(
        df['noticia'], df['label'], test_size=0.2, random_state=42)
    tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('portuguese'),
                                       analyzer='word', ngram_range=(1, 1),
                                       lowercase=True, use_idf=True)
    tfidf_train = tfidf_vectorizer.fit_transform(x_train)
    tfidf_test = tfidf_vectorizer.transform(x_test)

    # --- baseline models ----------------------------------------------
    report_accuracy(PassiveAggressiveClassifier(max_iter=50, random_state=0),
                    tfidf_train, y_train, tfidf_test, y_test)
    report_accuracy(RandomForestClassifier(random_state=0),
                    tfidf_train, y_train, tfidf_test, y_test)
    # BUG FIX: the original passed `ubsample=0.96` (typo); LightGBM ignores
    # unknown kwargs, so the intended 96% row subsampling never applied.
    report_accuracy(LGBMClassifier(learning_rate=0.1, num_leaves=128,
                                   min_child_samples=100, subsample=0.96,
                                   colsample_bytree=0.28, random_state=0,
                                   subsample_freq=1, n_estimators=100),
                    tfidf_train, y_train, tfidf_test, y_test)

    # --- hyper-parameter search for LightGBM --------------------------
    def treinar_modelo(params):
        """Objective for skopt: train LGBM with `params`, return -accuracy."""
        learning_rate, num_leaves, min_child_samples, subsample, colsample_bytree = params
        print(params, '\n')
        mdl = LGBMClassifier(learning_rate=learning_rate, num_leaves=num_leaves,
                             min_child_samples=min_child_samples,
                             subsample=subsample, colsample_bytree=colsample_bytree,
                             random_state=0, subsample_freq=1, n_estimators=100)
        mdl.fit(tfidf_train, y_train)
        # skopt minimises, so negate the accuracy.
        return -accuracy_score(y_test, mdl.predict(tfidf_test))

    space = [(1e-3, 1e-1, 'log-uniform'),  # learning rate
             (2, 128),                     # num_leaves
             (1, 100),                     # min_child_samples
             (0.05, 1.0),                  # subsample
             (0.1, 1.0)]                   # colsample_bytree

    # Random search baseline, then Bayesian optimisation over the same space.
    resultado = dummy_minimize(treinar_modelo, space, random_state=1,
                               verbose=1, n_calls=30)
    print(resultado.x)
    resultados_gp = gp_minimize(treinar_modelo, space, random_state=1,
                                verbose=1, n_calls=50, n_random_starts=10)
    print(resultados_gp.x)


if __name__ == '__main__':
    main()