Note
Click here to download the full example code
Semi-supervised Classification on a Text Dataset
In this example, semi-supervised classifiers are trained on the 20 newsgroups dataset (which will be automatically downloaded).
You can adjust the number of categories by passing their names to the dataset
loader, or set `categories=None` to load all 20 of them.
Traceback (most recent call last):
File "/build/scikit-learn-HBxYkq/scikit-learn-1.0.2/examples/semi_supervised/plot_semi_supervised_newsgroups.py", line 29, in <module>
data = fetch_20newsgroups(
File "/build/scikit-learn-HBxYkq/scikit-learn-1.0.2/.pybuild/cpython3_3.9/build/sklearn/datasets/_twenty_newsgroups.py", line 264, in fetch_20newsgroups
cache = _download_20newsgroups(
File "/build/scikit-learn-HBxYkq/scikit-learn-1.0.2/.pybuild/cpython3_3.9/build/sklearn/datasets/_twenty_newsgroups.py", line 74, in _download_20newsgroups
archive_path = _fetch_remote(ARCHIVE, dirname=target_dir)
File "/build/scikit-learn-HBxYkq/scikit-learn-1.0.2/.pybuild/cpython3_3.9/build/sklearn/datasets/_base.py", line 1454, in _fetch_remote
urlretrieve(remote.url, file_path)
File "/usr/lib/python3.9/urllib/request.py", line 239, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "/usr/lib/python3.9/urllib/request.py", line 214, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.9/urllib/request.py", line 517, in open
response = self._open(req, data)
File "/usr/lib/python3.9/urllib/request.py", line 534, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "/usr/lib/python3.9/urllib/request.py", line 494, in _call_chain
result = func(*args)
File "/usr/lib/python3.9/urllib/request.py", line 1389, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "/usr/lib/python3.9/urllib/request.py", line 1349, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno -2] Name or service not known>
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import f1_score
# Fetch the training split, restricted to the first five newsgroup categories.
# NOTE: fetch_20newsgroups downloads the archive when it is not cached locally.
categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
]
data = fetch_20newsgroups(subset="train", categories=categories)

# Short summary of what was loaded, followed by a blank separator line
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names), end="\n\n")
# Keyword arguments shared by the pipelines defined below:
# ``sdg_params`` is unpacked into SGDClassifier, ``vectorizer_params`` into
# CountVectorizer.
sdg_params = {"alpha": 1e-5, "penalty": "l2", "loss": "log"}
vectorizer_params = {"ngram_range": (1, 2), "min_df": 5, "max_df": 0.8}
# Supervised baseline: token counts -> tf-idf weighting -> SGD classifier
pipeline = Pipeline(
    steps=[
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", SGDClassifier(**sdg_params)),
    ]
)

# Self-training variant: same feature extraction, but the final step wraps an
# identically configured SGDClassifier in a SelfTrainingClassifier.
st_pipeline = Pipeline(
    steps=[
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        (
            "clf",
            SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True),
        ),
    ]
)
# LabelSpreading Pipeline
ls_pipeline = Pipeline(
    [
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        # LabelSpreading is given a dense array here: the tf-idf step emits a
        # sparse matrix, so it is densified before reaching the classifier.
        # ``toarray()`` returns a plain ndarray, whereas ``todense()`` returns
        # an ``np.matrix``, which scikit-learn deprecates as estimator input.
        # The step keeps its original "todense" name so lookups by step name
        # continue to work.
        ("todense", FunctionTransformer(lambda x: x.toarray())),
        ("clf", LabelSpreading()),
    ]
)
def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training data and print its micro-averaged F1 score.

    Prints the number of training samples and how many training labels equal
    ``-1`` (reported as unlabeled), fits ``clf`` on ``(X_train, y_train)``,
    predicts on ``X_test`` and prints the micro-averaged F1 score against
    ``y_test``, followed by a separator line and a blank line.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training samples and their labels
    X_test, y_test : held-out samples and their labels

    Returns
    -------
    None
    """
    print("Number of training samples:", len(X_train))
    unlabeled_count = sum(1 for label in y_train if label == -1)
    print("Unlabeled samples in training set:", unlabeled_count)

    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    score = f1_score(y_test, predictions, average="micro")
    print("Micro-averaged F1 score on test set: %0.3f" % score)

    # Visual separator between successive evaluation reports
    print("-" * 10)
    print()
if __name__ == "__main__":
    X, y = data.data, data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # 1) Fully supervised baseline using every training label
    print("Supervised SGDClassifier on 100% of the data:")
    eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)

    # Random boolean mask that is True for roughly 20% of the training samples
    y_mask = np.random.rand(len(y_train)) < 0.2

    # X_20 and y_20 hold only the training samples selected by the mask
    labeled_pairs = [
        (doc, label) for doc, label, keep in zip(X_train, y_train, y_mask) if keep
    ]
    X_20, y_20 = map(list, zip(*labeled_pairs))

    # 2) Supervised model trained on the selected subset only
    print("Supervised SGDClassifier on 20% of the training data:")
    eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test)

    # Mark every sample outside the mask as unlabeled (-1); this overwrites
    # y_train in place, so it happens only after the supervised runs above.
    y_train[~y_mask] = -1

    # 3) Semi-supervised models trained on labeled + unlabeled samples
    print("SelfTrainingClassifier on 20% of the training data (rest is unlabeled):")
    eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test)

    print("LabelSpreading on 20% of the data (rest is unlabeled):")
    eval_and_print_metrics(ls_pipeline, X_train, y_train, X_test, y_test)
Total running time of the script: ( 0 minutes 0.005 seconds)