Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ This ensures that `ninja`, `meson`, and other build tools remain available in yo
| `VarianceReduction` | Variance | Minimizes output variance (requires C extension) |
| `HintSVM` | SVM-based | SVM-guided active learning (requires C extension) |
| `DensityWeightedMeta` | Density | Weights informativeness by density |
| `DiversityWeightedMeta` | Batch Diversity | Diversity-aware batch selection over any score-based strategy |
| `DWUS` | Density + Uncertainty | Density-weighted uncertainty sampling |

## Available Models
Expand All @@ -196,6 +197,34 @@ lb = lbr.label(X[ask_id]) # query the label of unlabeled data from labeler insta
trn_ds.update(ask_id, lb) # update the dataset with newly queried data
```

### Batch querying

Sequential querying retrains the model once per label, which is
impractical for expensive models. Every score-based strategy also
supports querying a batch of distinct samples in one call:

```python
ask_ids = qs.make_query_batch(10) # top-10 by acquisition score
labels = [lbr.label(X[i]) for i in ask_ids]
trn_ds.update_batch(ask_ids, labels) # one call, same per-entry callbacks
```

A plain top-k batch can contain redundant near-duplicate points. Wrap
any base strategy with `DiversityWeightedMeta` to make batches
diversity-aware:

```python
from libact.models import LogisticRegression
from libact.query_strategies import DiversityWeightedMeta, UncertaintySampling

qs = DiversityWeightedMeta(
trn_ds,
base_query_strategy=UncertaintySampling(trn_ds, model=LogisticRegression()),
lmbda=0.5, # 0 = pure top-k, 1 = pure farthest-point
)
ask_ids = qs.make_query_batch(10)
```

### Using CoreSet, BALD, and InformationDensity Strategies

```python
Expand Down Expand Up @@ -254,6 +283,9 @@ Available examples:
with other active learning algorithms.
- [albl_new_strategies_benchmark](examples/albl_new_strategies_benchmark.py): Benchmarks
CoreSet, BALD, and InformationDensity query strategies individually and combined via ALBL.
- [batch_query_plot](examples/batch_query_plot.py): This example compares sequential
active learning with batch-mode querying (`make_query_batch` / `update_batch`),
including diversity-aware batches via `DiversityWeightedMeta`.
- [multilabel_plot](examples/multilabel_plot.py): This example compares the performance of
algorithms under multilabel setting.
- [alce_plot](examples/alce_plot.py): This example compares the performance of
Expand Down
8 changes: 8 additions & 0 deletions docs/libact.query_strategies.rst
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,14 @@ libact.query_strategies.coreset module
:undoc-members:
:show-inheritance:

libact.query_strategies.diversity_weighted_meta module
-------------------------------------------------------

.. automodule:: libact.query_strategies.diversity_weighted_meta
:members:
:undoc-members:
:show-inheritance:

libact.query_strategies.epsilon_uncertainty_sampling module
-----------------------------------------------------------

Expand Down
136 changes: 136 additions & 0 deletions examples/batch_query_plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
Batch-mode active learning example.

Sequential active learning retrains the model after every single query
(make_query -> label -> update), which is impractical when training is
expensive. Batch querying selects batch_size points per round
(make_query_batch -> label -> update_batch), cutting the number of
training rounds by a factor of batch_size.

This script compares, on the diabetes dataset:

- sequential uncertainty sampling (one query per training round),
- plain top-k batch uncertainty sampling (one round per batch, but the
batch may contain redundant near-duplicate points), and
- DiversityWeightedMeta over the same base strategy (one round per
batch, with diversity-aware batches).
"""

import copy
import os

import numpy as np
import matplotlib.pyplot as plt
try:
from sklearn.model_selection import train_test_split
except ImportError:
from sklearn.cross_validation import train_test_split

# libact classes
from libact.base.dataset import Dataset, import_libsvm_sparse
from libact.models import LogisticRegression
from libact.query_strategies import DiversityWeightedMeta, UncertaintySampling
from libact.labelers import IdealLabeler


def run_sequential(trn_ds, tst_ds, lbr, model, qs, quota):
    """Run the one-query-per-round loop and record the test error.

    Every round asks ``qs`` for a single entry id, obtains its label from
    ``lbr``, writes the label into ``trn_ds``, retrains ``model`` on
    ``trn_ds`` and scores it on ``tst_ds``.  The model is therefore
    trained ``quota`` times in total.

    Parameters
    ----------
    trn_ds, tst_ds : dataset objects
        Training pool (updated in place) and held-out test set.
    lbr : labeler
        Object whose ``label(x)`` returns the label for one sample.
    model : model
        Object exposing ``train(dataset)`` and ``score(dataset)``.
    qs : query strategy
        Object exposing ``make_query()``.
    quota : int
        Number of rounds (and hence labels) to run.

    Returns
    -------
    n_labeled_axis : list
        ``trn_ds.len_labeled()`` after each round.
    E_out : list
        ``1 - model.score(tst_ds)`` after each round.
    """
    history = []

    for _ in range(quota):
        query_id = qs.make_query()
        # Element 0 of a dataset entry is what the labeler is asked about
        # (presumably the feature vector -- confirm against Dataset.data).
        sample = trn_ds.data[query_id][0]
        trn_ds.update(query_id, lbr.label(sample))

        model.train(trn_ds)
        history.append((trn_ds.len_labeled(), 1 - model.score(tst_ds)))

    n_labeled_axis = [count for count, _ in history]
    E_out = [error for _, error in history]
    return n_labeled_axis, E_out


def run_batch(trn_ds, tst_ds, lbr, model, qs, quota, batch_size):
    """Run batch-mode active learning and record the test error per round.

    Every round asks ``qs`` for a batch of entry ids, labels each entry
    with ``lbr``, applies all labels through one ``trn_ds.update_batch``
    call, retrains ``model`` on ``trn_ds`` and scores it on ``tst_ds``.

    Exactly ``quota`` labels are queried.  When ``quota`` is not a
    multiple of ``batch_size`` the final round uses a smaller batch of
    ``quota % batch_size`` entries, so the model is trained
    ``ceil(quota / batch_size)`` times.  When it divides evenly this is
    ``quota / batch_size`` rounds of full batches.

    Parameters
    ----------
    trn_ds, tst_ds : dataset objects
        Training pool (updated in place) and held-out test set.
    lbr : labeler
        Object whose ``label(x)`` returns the label for one sample.
    model : model
        Object exposing ``train(dataset)`` and ``score(dataset)``.
    qs : query strategy
        Object exposing ``make_query_batch(batch_size)``.
    quota : int
        Total number of labels to query.
    batch_size : int
        Maximum number of labels queried per round; must be at least 1.

    Returns
    -------
    n_labeled_axis : list
        ``trn_ds.len_labeled()`` after each round.
    E_out : list
        ``1 - model.score(tst_ds)`` after each round.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(
            "batch_size must be at least 1, got %r" % (batch_size,))

    n_labeled_axis, E_out = [], []
    remaining = quota

    while remaining > 0:
        # Shrink the last round to the leftover quota instead of silently
        # dropping it, so the requested number of labels is always spent.
        current_size = min(batch_size, remaining)
        ask_ids = qs.make_query_batch(current_size)
        labels = [lbr.label(trn_ds.data[ask_id][0]) for ask_id in ask_ids]
        trn_ds.update_batch(ask_ids, labels)
        remaining -= current_size

        model.train(trn_ds)
        n_labeled_axis.append(trn_ds.len_labeled())
        E_out.append(1 - model.score(tst_ds))

    return n_labeled_axis, E_out


def split_train_test(dataset_filepath, test_size, n_labeled):
    """Load a libsvm-format file and build the datasets for the experiment.

    Parameters
    ----------
    dataset_filepath : str
        Path handed to ``import_libsvm_sparse``.
    test_size : float or int
        Forwarded unchanged to ``train_test_split`` as ``test_size``.
    n_labeled : int
        Number of leading training samples that keep their label; every
        later training sample gets ``None`` as its label.

    Returns
    -------
    trn_ds : Dataset
        Training features with only the first ``n_labeled`` labels kept.
    tst_ds : Dataset
        Test features with their labels.
    fully_labeled_trn_ds : Dataset
        Training features with all of their labels.
    """
    X, y = import_libsvm_sparse(dataset_filepath).format_sklearn()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size)

    # Keep the first n_labeled labels and pad the rest with None.
    n_hidden = len(y_train) - n_labeled
    partial_labels = np.concatenate([y_train[:n_labeled], [None] * n_hidden])

    trn_ds = Dataset(X_train, partial_labels)
    tst_ds = Dataset(X_test, y_test)
    fully_labeled_trn_ds = Dataset(X_train, y_train)
    return trn_ds, tst_ds, fully_labeled_trn_ds


def main():
    """Compare sequential, top-k batch and diversity-aware batch querying.

    Loads ``diabetes.txt`` from the directory containing this script,
    runs the three experiments on independent deep copies of the same
    initial training pool, prints the number of training rounds each one
    needed, and plots test error against the number of labeled samples.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_filepath = os.path.join(script_dir, 'diabetes.txt')
    test_size = 0.33    # fraction of samples assigned to the test set
    n_labeled = 10      # number of samples that are initially labeled
    quota = 120         # number of samples to query in total
    batch_size = 10     # number of samples per batch query
    n_batch_rounds = quota // batch_size

    trn_ds, tst_ds, fully_labeled_trn_ds = split_train_test(
        dataset_filepath, test_size, n_labeled)
    # Copies are taken before any query is made, so all three runs start
    # from the same labeled/unlabeled split.
    trn_ds2 = copy.deepcopy(trn_ds)
    trn_ds3 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    # 1) Sequential uncertainty sampling: one training round per label.
    qs1 = UncertaintySampling(
        trn_ds, method='lc', model=LogisticRegression())
    n1, E1 = run_sequential(
        trn_ds, tst_ds, lbr, LogisticRegression(), qs1, quota)
    print('sequential US : %3d training rounds for %d labels'
          % (quota, quota))

    # 2) Plain top-k batch: batch_size labels per training round.  The
    #    batch may contain redundant near-duplicates.
    qs2 = UncertaintySampling(
        trn_ds2, method='lc', model=LogisticRegression())
    n2, E2 = run_batch(
        trn_ds2, tst_ds, lbr, LogisticRegression(), qs2, quota, batch_size)
    print('top-k batch US : %3d training rounds for %d labels'
          % (n_batch_rounds, quota))

    # 3) Diversity-aware batches over the same base strategy.
    base_qs = UncertaintySampling(
        trn_ds3, method='lc', model=LogisticRegression())
    qs3 = DiversityWeightedMeta(
        trn_ds3,
        base_query_strategy=base_qs,
        lmbda=0.5,
        random_state=1126,
    )
    n3, E3 = run_batch(
        trn_ds3, tst_ds, lbr, LogisticRegression(), qs3, quota, batch_size)
    print('diversity batch US : %3d training rounds for %d labels'
          % (n_batch_rounds, quota))

    curves = [
        (n1, E1, 'g', 'sequential US'),
        (n2, E2, 'b--o', 'top-k batch US'),
        (n3, E3, 'r--s', 'DiversityWeightedMeta batch'),
    ]
    for n_labeled_axis, errors, fmt, label in curves:
        plt.plot(n_labeled_axis, errors, fmt, label=label)
    plt.xlabel('Number of labeled samples')
    plt.ylabel('Test error')
    plt.title('Sequential vs batch-mode active learning')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=3)
    plt.show()


if __name__ == '__main__':
    main()
51 changes: 51 additions & 0 deletions libact/base/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,57 @@ def update(self, entry_id, new_label):
for callback in self._update_callback:
callback(entry_id, new_label)

def update_batch(self, entry_ids, labels):
    """Assign labels to several entries with one call.

    After validating the arguments, each ``(entry_id, label)`` pair is
    forwarded to :py:meth:`update` one at a time, in the order given.
    Registered callbacks are therefore notified exactly as they would be
    by the equivalent sequence of individual ``update()`` calls, which
    keeps stateful observers correct (e.g. query strategies that retrain
    models or maintain index bookkeeping in their update hook).

    Observers may place their own requirements on the update stream.
    For instance ActiveLearningByLearning assumes every update is for an
    entry it returned from ``make_query()``; updating other entries is
    unsupported, just as with individual ``update()`` calls.

    Parameters
    ----------
    entry_ids : array-like of int, shape (n_updates,)
        Distinct entry ids of the samples to update.

    labels : sequence, shape (n_updates,)
        Label for each entry. None marks an entry as unlabeled.

    Raises
    ------
    ValueError
        If entry_ids is not one-dimensional, labels has no length,
        the two lengths differ, or entry_ids contains duplicates.
        All of these are checked before any entry is updated.
    """
    ids = np.asarray(entry_ids)
    if ids.ndim != 1:
        raise ValueError(
            "entry_ids must be a one-dimensional array-like; for a "
            "single entry use update(entry_id, label)")

    try:
        label_count = len(labels)
    except TypeError:
        raise ValueError(
            "labels must be a sequence of the same length as entry_ids")

    id_count = ids.shape[0]
    if id_count != label_count:
        raise ValueError(
            "entry_ids and labels must have the same length, got "
            "%d and %d" % (id_count, label_count))
    if np.unique(ids).size != id_count:
        raise ValueError("entry_ids contains duplicate entries")

    # Delegate to update() so callbacks fire once per entry, in order.
    for pair in zip(ids, labels):
        self.update(*pair)

def on_update(self, callback):
"""
Add callback function to call when dataset updated.
Expand Down
86 changes: 86 additions & 0 deletions libact/base/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@
Base interfaces for use in the package.
The package works according to the interfaces defined below.
"""
import numbers

from six import with_metaclass

from abc import ABCMeta, abstractmethod

import numpy as np


class QueryStrategy(with_metaclass(ABCMeta, object)):

Expand Down Expand Up @@ -61,6 +65,88 @@ def _get_scores(self):
"This is required for batch mode and score-based composition."
)

@staticmethod
def _check_batch_size(batch_size, n_unlabeled):
    """Reject batch sizes that cannot be served from the unlabeled pool.

    Returns None when the arguments are acceptable; otherwise raises.

    Parameters
    ----------
    batch_size : int
        The requested batch size.

    n_unlabeled : int
        Number of unlabeled samples currently in the pool.

    Raises
    ------
    TypeError
        If batch_size is not an integral number. ``bool`` values are
        rejected even though ``bool`` is a subclass of ``int``.

    ValueError
        If batch_size is below 1, the pool is empty, or batch_size is
        larger than the pool.
    """
    is_plain_integer = (isinstance(batch_size, numbers.Integral)
                        and not isinstance(batch_size, bool))
    if not is_plain_integer:
        raise TypeError(
            "batch_size must be an integer, got %r" % (batch_size,))

    if batch_size < 1:
        raise ValueError(
            "batch_size must be at least 1, got %d" % batch_size)

    # Report an empty pool separately from an over-sized request.
    if n_unlabeled == 0:
        raise ValueError("No unlabeled samples available")

    if batch_size > n_unlabeled:
        raise ValueError(
            "batch_size (%d) exceeds the number of unlabeled samples "
            "(%d)" % (batch_size, n_unlabeled))

def make_query_batch(self, batch_size):
    """Select ``batch_size`` distinct unlabeled entries to query.

    This default implementation takes the entry ids and acquisition
    scores returned by :py:meth:`_get_scores`, sorts them by descending
    score and returns the first ``batch_size`` entry ids.  Strategies
    override this method when a faithful batch generalization differs
    from top-k (e.g. iterative k-center for CoreSet, sampling without
    replacement for RandomSampling).

    The sort is stable, so entries with equal scores keep the order in
    which :py:meth:`_get_scores` returned them.  Because ties are broken
    deterministically, ``make_query_batch(1)`` may differ from
    :py:meth:`make_query` for strategies that randomize tie-breaking.

    Parameters
    ----------
    batch_size : int
        Number of samples to query. Must satisfy
        ``1 <= batch_size <= n_unlabeled``; the value is never clamped
        silently.

    Returns
    -------
    entry_ids : np.ndarray of int, shape (batch_size,)
        Distinct entry ids of the samples to be queried, most preferred
        first.

    Raises
    ------
    TypeError
        If batch_size is not an integer.

    ValueError
        If batch_size < 1, batch_size exceeds the number of unlabeled
        samples, or there are no unlabeled samples.

    NotImplementedError
        If the strategy does not support per-sample scoring through
        :py:meth:`_get_scores`.
    """
    pool_ids, pool_scores = self._get_scores()
    # Validation needs the pool size, so it runs after scoring.
    self._check_batch_size(batch_size, len(pool_ids))

    # Negate so an ascending stable sort yields descending scores while
    # preserving pool order among ties.
    negated = -np.asarray(pool_scores, dtype=float)
    descending = np.argsort(negated, kind='stable')
    chosen = descending[:batch_size]
    return np.asarray(pool_ids)[chosen]

@abstractmethod
def make_query(self):
"""Return the index of the sample to be queried and labeled. Read-only.
Expand Down
Loading
Loading