To extract numerical feature vectors from a sequence of symbols, scikit-learn provides utilities for the most common approaches.
Also have a look at the text feature extraction section in the user guide and the working with text data section.
Individual samples are assumed to be files stored in a two-level folder structure such as the following:
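(The folder and file names below are only placeholders; each subfolder of the container folder is treated as one category.)

container_folder/
    category_1_folder/
        file_1.txt
        file_2.txt
        ...
    category_2_folder/
        file_3.txt
        ...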
The load_files function needs the following parameters: the container path (the root folder) and, optionally, a list of categories (the names of the subfolders to load) and an encoding for reading the text files.
from sklearn.datasets import load_files
# corpus-4docs has no category subfolders, so use the parent directory as root and restrict loading to the 'corpus-4docs' folder
corpus_4_docs = load_files('DataSetEx6', categories=['corpus-4docs'], encoding='utf-8')
#corpus_30_docs = load_files('DataSetEx6/corpus-30docs',encoding='utf-8')
for text in corpus_4_docs.data:
print(text[:30])
print(corpus_4_docs.target)
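Because all four documents live directly in the 'corpus-4docs' folder, they all get the same target value 0. A quick check (these are the standard fields of the Bunch object returned by load_files):
print(corpus_4_docs.target_names)  # the category names, here just ['corpus-4docs']
print(len(corpus_4_docs.data))     # number of loaded documents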
from sklearn.feature_extraction.text import CountVectorizer
d1 = "Saturn is the gas planet with rings."
d2 = "Jupiter is the largest gas planet."
d3 = "Saturn is the Roman god of sowing."
docs = [d1, d2, d3]
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(docs) #corpus_4_docs.data)
print(count_matrix.toarray())
for feature_name in count_vectorizer.get_feature_names():
print(feature_name)
print(len(count_vectorizer.get_feature_names()))
def get_word_freq(matrix, vectorizer):
    '''Return (total frequency, word) pairs, sorted in descending order of frequency.'''
    return sorted([(matrix.getcol(idx).sum(), word) for word, idx in vectorizer.vocabulary_.items()], reverse=True)
for freq, word in get_word_freq(count_matrix, count_vectorizer)[:40]:
    print("{:.3f} {}".format(freq, word))
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer()
tf_idf_matrix = tf_idf_vectorizer.fit_transform(docs) #corpus_4_docs.data)
print(tf_idf_matrix.toarray())
for feature_name in tf_idf_vectorizer.get_feature_names()[:20]:
print(feature_name)
vectorizer_with_stopwords = TfidfVectorizer(stop_words='english')
tf_idf_stop_matrix = vectorizer_with_stopwords.fit_transform(corpus_4_docs.data)
for tfidf, word in get_word_freq(tf_idf_stop_matrix, vectorizer_with_stopwords)[:40]:
print("{:.3f} {}".format(tfidf, word))
from nltk.stem.porter import PorterStemmer # use the stemmer from nltk
import re
class TokenizerWithStemming(object):
def __init__(self):
self.stemmer = PorterStemmer()
self.token_pattern = re.compile(r"(?u)\b\w\w+\b")
def __call__(self, doc):
# tokenize the input with a regex and stem each token
return [self.stemmer.stem(t) for t in self.token_pattern.findall(doc)]
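A quick sanity check of the tokenizer on a made-up sentence (the exact stems depend on the NLTK version, but plurals and -ing forms should be reduced to their stems):
stem_tokenizer = TokenizerWithStemming()
print(stem_tokenizer("The planets and moons are orbiting"))  # e.g. ['the', 'planet', 'and', 'moon', 'are', 'orbit']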
stem_vectorizer = TfidfVectorizer(tokenizer=TokenizerWithStemming())
stem_matrix = stem_vectorizer.fit_transform(corpus_4_docs.data)
for tfidf, word in get_word_freq(stem_matrix, stem_vectorizer)[:40]:
print("{:.3f} {}".format(tfidf, word))
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(stem_matrix)
# One can also use a star to import all pairwise metrics
from sklearn.metrics.pairwise import *
linear_kernel(stem_matrix)
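Because TfidfVectorizer normalizes each document vector to unit length by default (norm='l2'), the linear kernel of this matrix coincides with its cosine similarity; a small check (numpy is imported here only for the comparison):
import numpy as np
print(np.allclose(cosine_similarity(stem_matrix), linear_kernel(stem_matrix)))  # expected: True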
prune_vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.3)  # relative thresholds: fractions of the number of documents
prune_vectorizer = TfidfVectorizer(min_df=5, max_df=20)  # absolute document counts
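Note that neither prune_vectorizer is fitted above. A minimal sketch on the small three-sentence corpus from earlier (min_df=2 is chosen here just for illustration and is not part of the exercise):
pruned_vectorizer = TfidfVectorizer(min_df=2)
pruned_matrix = pruned_vectorizer.fit_transform(docs)
print(sorted(pruned_vectorizer.vocabulary_))  # only terms occurring in at least 2 of the 3 documents, e.g. ['gas', 'is', 'planet', 'saturn', 'the']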
corpus_30_docs = load_files('DataSetEx6/corpus-30docs', encoding='utf-8')
tf_idf_vectorizer = TfidfVectorizer()
tf_idf_matrix = tf_idf_vectorizer.fit_transform(corpus_30_docs.data)
from sklearn.cluster import KMeans
k_means_estimator = KMeans(n_clusters=2)
labels = k_means_estimator.fit_predict(tf_idf_matrix)
print(labels)
print(corpus_30_docs.target)
from sklearn import metrics
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]
labels_pred_2 = [1, 1, 0, 0, 3, 3]  # permute 0 and 1 and rename 2 to 3
print(metrics.adjusted_rand_score(labels_true, labels_pred))
print(metrics.adjusted_rand_score(labels_true, labels_pred_2))
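The same metric can be used to compare the k-means clustering from above with the known categories of corpus-30docs (the exact score varies between runs because of the random initialization of k-means, and a perfect score is not expected with only 2 clusters for 3 categories):
print(metrics.adjusted_rand_score(corpus_30_docs.target, labels))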
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
pd.set_option('display.max_rows', 500)
text_zero_indices = [i for i, text in enumerate(corpus_30_docs.data) if corpus_30_docs.target[i] == 0]
text_one_indices = [i for i, text in enumerate(corpus_30_docs.data) if corpus_30_docs.target[i] == 1]
text_two_indices = [i for i, text in enumerate(corpus_30_docs.data) if corpus_30_docs.target[i] == 2]
count_vectorizer = CountVectorizer(tokenizer=TokenizerWithStemming(), stop_words='english')
count_matrix = count_vectorizer.fit_transform(corpus_30_docs.data)
rows = []
for word, idx in count_vectorizer.vocabulary_.items():
rows.append((word,
(count_matrix.getcol(idx) > 0).sum(),
(count_matrix.getcol(idx)[text_zero_indices] > 0).sum(),
(count_matrix.getcol(idx)[text_one_indices] > 0).sum(),
(count_matrix.getcol(idx)[text_two_indices] > 0).sum()))
document_freqs = pd.DataFrame(rows, columns = ['word', 'All', corpus_30_docs.target_names[0], corpus_30_docs.target_names[1], corpus_30_docs.target_names[2]])
document_freqs.sort_values('All', ascending=False)