from torchvision.datasets.utils import download_and_extract_archive
from torch.utils.data import DataLoader, Dataset
from pathlib import Path
url = 'https://download.pytorch.org/tutorial/data.zip'
download_and_extract_archive(url, '.')
path = '/content/data/names/'
Next we build the (name, language) pairs, the list of target languages, and the name vocabulary:
class Preprocess:
    def __init__(self, path):
        self.path = path

    def get_names_targets(self):
        # Build (name, language) pairs from every names/<Language>.txt file
        self.corpus = []
        for file_path in Path(self.path).glob('*.txt'):
            with open(file_path) as f:
                for line in f.readlines():
                    self.corpus.append((line.split('\n')[0], file_path.stem))
        return self.corpus

    def get_targets(self):
        # One target class per language file
        targets = []
        for file_path in Path(self.path).glob('*.txt'):
            targets.append(file_path.stem)
        return targets

    def get_vocab(self):
        # Unique names from the corpus built by get_names_targets()
        vocab = set()
        for name, target in self.corpus:
            vocab.add(name)
        return list(vocab)
class LoadDataset(Dataset):
    def __init__(self, names_targets, names_vocab, target_vocab):
        self.name_target = names_targets
        self.name_vocab = names_vocab
        self.target_vocab = target_vocab

    def __getitem__(self, idx):
        # Encode both the name and its language as integer indices into their vocabularies
        name, target = self.name_target[idx]
        return self.name_vocab.index(name), self.target_vocab.index(target)

    def __len__(self):
        return len(self.name_target)
pre = Preprocess(path)
data = pre.get_names_targets()
target_vocab = pre.get_targets()
name_vocab = pre.get_vocab()
ds = LoadDataset(data, name_vocab, target_vocab)
for x, y in ds:
    print(x, y)
    break
5823 0
dls = DataLoader(ds, 16, shuffle=True)
for data, label in dls:
    print(data.shape, label.shape)
    print(data[0], label[0], '\n', name_vocab[data[0].item()], target_vocab[label[0].item()])
    break
torch.Size([16]) torch.Size([16]) tensor(1468) tensor(8) Jigailo Russian
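The batches are plain integer indices, so downstream a model would typically look them up in an embedding table. A minimal sketch of that step, assuming an illustrative 64-dimensional embedding (the layer size is my own choice, not part of the data pipeline):

import torch.nn as nn

# Hypothetical embedding: one row per name in name_vocab, 64-dimensional vectors
embedding = nn.Embedding(num_embeddings=len(name_vocab), embedding_dim=64)

for data, label in dls:
    emb = embedding(data)   # (16,) indices -> (16, 64) vectors
    print(emb.shape)
    break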
text = "In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth."
import spacy
spacy_nlp = spacy.load('en_core_web_sm')
doc = spacy_nlp(text)
[token.text for token in doc]
text1 = "ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456"
doc1 = spacy_nlp(text1)
[token.text for token in doc1]
['ConcateStringAnd123', 'ConcateSepcialCharacter_!@', '#', '!', '@#$%^&*()_+', '0123456']
text2 = "Let’s go to N.Y.!"
doc2 = spacy_nlp(text2)
[token.text for token in doc2]
['Let', '’s', 'go', 'to', 'N.Y.', '!']
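spaCy keeps 'N.Y.' intact and splits '’s' because of its built-in tokenizer exceptions. If a corpus needs its own exception, one can be registered with add_special_case; the 'gimme' split below is just an illustrative rule, not something this text requires:

from spacy.symbols import ORTH

# Hypothetical rule: always tokenize "gimme" as "gim" + "me"
spacy_nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
[token.text for token in spacy_nlp("gimme that")]   # ['gim', 'me', 'that']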
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.word_tokenize(text)
nltk.word_tokenize(text1)
nltk.sent_tokenize(text), nltk.sent_tokenize(text1), nltk.sent_tokenize(text2)
(['In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning).', 'A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer.', 'A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.'], ['ConcateStringAnd123 ConcateSepcialCharacter_!', '@# !', '@#$%^&*()_+ 0123456'], ['Let’s go to N.Y.!'])
# part-of-speech tagging
tokens = nltk.word_tokenize(text)
nltk.pos_tag(tokens)
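pos_tag returns (token, tag) pairs using Penn Treebank tags, so the tags can be used directly to filter tokens. For example, keeping only the nouns (a small sketch of my own, not part of the original walkthrough):

tagged = nltk.pos_tag(tokens)
# Penn Treebank noun tags all start with 'NN' (NN, NNS, NNP, NNPS)
nouns = [word for word, tag in tagged if tag.startswith('NN')]
nouns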
We can remove stopwords when performing tasks such as text classification, where frequent function words like 'the' and 'is' carry little discriminative signal.
from nltk.corpus import stopwords
nltk.download('stopwords')
tokens = nltk.word_tokenize(text)
stop_word = set(stopwords.words('english'))
[token for token in tokens if token not in stop_word]
t = "He determined to drop his litigation with the monastry and relinguish his claims to the wood-cuting and \n fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had \n indeed the vaguest idea where the wood and river in question were."
to = nltk.word_tokenize(t)
[token for token in to if token not in stop_word]
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
ps.stem('going'), lemmatizer.lemmatize('going', pos='v')  # the lemmatizer only reduces the word for the part of speech you pass in
('go', 'go')
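For 'going' the two agree, but they diverge on words where stripping the suffix does not leave a dictionary form. A quick comparison using the same ps and lemmatizer objects (the example words are my own):

for word in ['studies', 'running', 'better']:
    print(word, '->', ps.stem(word), '|', lemmatizer.lemmatize(word, pos='v'))
# The stemmer chops suffixes mechanically (e.g. 'studies' -> 'studi'),
# while the lemmatizer maps to a valid dictionary form ('studies' -> 'study').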
[ps.stem(token) for token in tokens if token not in stop_word]
lemma = []
for token in tokens:
    if token not in stop_word:
        # try the noun, verb, then adjective senses in turn
        word = lemmatizer.lemmatize(token, pos='n')
        word = lemmatizer.lemmatize(word, pos='v')
        word = lemmatizer.lemmatize(word, pos='a')
        lemma.append(word)
lemma
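Chaining pos='n', 'v', 'a' works, but a common alternative is to pick the WordNet part of speech from the POS tags we already computed with nltk.pos_tag. A sketch of that mapping (the wordnet_pos helper is my own, not an NLTK function):

from nltk.corpus import wordnet

def wordnet_pos(treebank_tag):
    # Map Penn Treebank tags to the WordNet constants the lemmatizer expects
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

lemma_pos = [lemmatizer.lemmatize(token, pos=wordnet_pos(tag))
             for token, tag in nltk.pos_tag(tokens) if token not in stop_word]
lemma_pos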
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
[token.lemma_ for token in doc if token.text not in stop_word]