ntucllab · CoolJosh0221 · Jun 5, 2026 · Jul 4, 2026
diff --git a/README.md b/README.md
@@ -172,6 +172,7 @@ This ensures that `ninja`, `meson`, and other build tools remain available in yo
 | `VarianceReduction` | Variance | Minimizes output variance (requires C extension) |
 | `HintSVM` | SVM-based | SVM-guided active learning (requires C extension) |
 | `DensityWeightedMeta` | Density | Weights informativeness by density |
+| `DiversityWeightedMeta` | Batch Diversity | Diversity-aware batch selection over any score-based strategy |
 | `DWUS` | Density + Uncertainty | Density-weighted uncertainty sampling |
 
 ## Available Models
@@ -196,6 +197,34 @@ lb = lbr.label(X[ask_id]) # query the label of unlabeled data from labeler insta
 trn_ds.update(ask_id, lb) # update the dataset with newly queried data
 ```
 
+### Batch querying
+
+Sequential querying retrains the model once per label, which is
+impractical when training is expensive. Every score-based strategy
+can also select a batch of distinct samples in a single call:
+
+```python
+ask_ids = qs.make_query_batch(10)         # top-10 by acquisition score
+labels = [lbr.label(X[i]) for i in ask_ids]
+trn_ds.update_batch(ask_ids, labels)      # one call, same per-entry callbacks
+```
+
+A plain top-k batch can contain redundant near-duplicate points. Wrap
+any base strategy with `DiversityWeightedMeta` to make batches
+diversity-aware:
+
+```python
+from libact.models import LogisticRegression
+from libact.query_strategies import DiversityWeightedMeta, UncertaintySampling
+
+qs = DiversityWeightedMeta(
+    trn_ds,
+    base_query_strategy=UncertaintySampling(trn_ds, model=LogisticRegression()),
+    lmbda=0.5,  # 0 = pure top-k, 1 = pure farthest-point
+)
+ask_ids = qs.make_query_batch(10)
+```
+
 ### Using CoreSet, BALD, and InformationDensity Strategies
 
 ```python
@@ -254,6 +283,9 @@ Available examples:
     with other active learning algorithms.
   - [albl_new_strategies_benchmark](examples/albl_new_strategies_benchmark.py): Benchmarks
     CoreSet, BALD, and InformationDensity query strategies individually and combined via ALBL.
+  - [batch_query_plot](examples/batch_query_plot.py): This example compares sequential
+    active learning with batch-mode querying (`make_query_batch` / `update_batch`),
+    including diversity-aware batches via `DiversityWeightedMeta`.
   - [multilabel_plot](examples/multilabel_plot.py): This example compares the performance of
     algorithms under multilabel setting.
   - [alce_plot](examples/alce_plot.py): This example compares the performance of

diff --git a/docs/libact.query_strategies.rst b/docs/libact.query_strategies.rst
@@ -83,6 +83,14 @@ libact.query_strategies.coreset module
     :undoc-members:
     :show-inheritance:
 
+libact.query_strategies.diversity_weighted_meta module
+-------------------------------------------------------
+
+.. automodule:: libact.query_strategies.diversity_weighted_meta
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 libact.query_strategies.epsilon_uncertainty_sampling module
 -----------------------------------------------------------
 

diff --git a/examples/batch_query_plot.py b/examples/batch_query_plot.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+"""
+Batch-mode active learning example.
+
+Sequential active learning retrains the model after every single query
+(make_query -> label -> update), which is impractical when training is
+expensive. Batch querying selects batch_size points per round
+(make_query_batch -> label -> update_batch), cutting the number of
+training rounds by a factor of batch_size.
+
+This script compares, on the diabetes dataset:
+
+- sequential uncertainty sampling (one query per training round),
+- plain top-k batch uncertainty sampling (one round per batch, but the
+  batch may contain redundant near-duplicate points), and
+- DiversityWeightedMeta over the same base strategy (one round per
+  batch, with diversity-aware batches).
+"""
+
+import copy
+import os
+
+import numpy as np
+import matplotlib.pyplot as plt
+try:
+    from sklearn.model_selection import train_test_split
+except ImportError:
+    from sklearn.cross_validation import train_test_split
+
+# libact classes
+from libact.base.dataset import Dataset, import_libsvm_sparse
+from libact.models import LogisticRegression
+from libact.query_strategies import DiversityWeightedMeta, UncertaintySampling
+from libact.labelers import IdealLabeler
+
+
+def run_sequential(trn_ds, tst_ds, lbr, model, qs, quota):
+    """Label one query per round and retrain: quota rounds in total."""
+    n_labeled_axis, E_out = [], []  # one entry per round, in round order
+
+    for _ in range(quota):
+        ask_id = qs.make_query()
+        lb = lbr.label(trn_ds.data[ask_id][0])  # presumably [0] = features
+        trn_ds.update(ask_id, lb)
+
+        model.train(trn_ds)  # retrain after each new label
+        n_labeled_axis.append(trn_ds.len_labeled())
+        E_out.append(1 - model.score(tst_ds))  # test error (1 - score)
+
+    return n_labeled_axis, E_out
+
+
+def run_batch(trn_ds, tst_ds, lbr, model, qs, quota, batch_size):
+    """Label batch_size queries per round: quota // batch_size rounds."""
+    n_labeled_axis, E_out = [], []  # one entry per round, in round order
+    # NOTE: floor division; any remainder of quota is never queried.
+    for _ in range(quota // batch_size):
+        ask_ids = qs.make_query_batch(batch_size)
+        labels = [lbr.label(trn_ds.data[ask_id][0]) for ask_id in ask_ids]
+        trn_ds.update_batch(ask_ids, labels)
+
+        model.train(trn_ds)  # one retraining per batch of new labels
+        n_labeled_axis.append(trn_ds.len_labeled())
+        E_out.append(1 - model.score(tst_ds))  # test error (1 - score)
+
+    return n_labeled_axis, E_out
+
+
+def split_train_test(dataset_filepath, test_size, n_labeled):
+    """Split the data, hiding all but the first n_labeled training labels."""
+    X, y = import_libsvm_sparse(dataset_filepath).format_sklearn()
+    X_train, X_test, y_train, y_test = \
+        train_test_split(X, y, test_size=test_size)  # no random_state passed
+    trn_ds = Dataset(X_train, np.concatenate(
+        [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
+    tst_ds = Dataset(X_test, y_test)
+    fully_labeled_trn_ds = Dataset(X_train, y_train)  # all training labels
+
+    return trn_ds, tst_ds, fully_labeled_trn_ds
+
+
+def main():
+    """Run all three strategies on diabetes.txt and plot their test errors."""
+    dataset_filepath = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), 'diabetes.txt')
+    test_size = 0.33    # fraction of samples assigned to the test set
+    n_labeled = 10      # number of samples that are initially labeled
+    quota = 120         # number of samples to query in total
+    batch_size = 10     # number of samples per batch query
+    trn_ds, tst_ds, fully_labeled_trn_ds = \
+        split_train_test(dataset_filepath, test_size, n_labeled)
+    trn_ds2 = copy.deepcopy(trn_ds)  # independent copies: each run starts
+    trn_ds3 = copy.deepcopy(trn_ds)  # from the same initial labeled set
+    lbr = IdealLabeler(fully_labeled_trn_ds)
+
+    # 1) Sequential uncertainty sampling: one training round per label.
+    qs1 = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
+    n1, E1 = run_sequential(
+        trn_ds, tst_ds, lbr, LogisticRegression(), qs1, quota)
+    print('sequential US      : %3d training rounds for %d labels'
+          % (quota, quota))
+
+    # 2) Plain top-k batch: batch_size labels per training round. The
+    #    batch may contain redundant near-duplicates.
+    qs2 = UncertaintySampling(trn_ds2, method='lc', model=LogisticRegression())
+    n2, E2 = run_batch(
+        trn_ds2, tst_ds, lbr, LogisticRegression(), qs2, quota, batch_size)
+    print('top-k batch US     : %3d training rounds for %d labels'
+          % (quota // batch_size, quota))
+
+    # 3) Diversity-aware batches over the same base strategy.
+    qs3 = DiversityWeightedMeta(
+        trn_ds3,
+        base_query_strategy=UncertaintySampling(
+            trn_ds3, method='lc', model=LogisticRegression()),
+        lmbda=0.5,
+        random_state=1126,
+    )
+    n3, E3 = run_batch(
+        trn_ds3, tst_ds, lbr, LogisticRegression(), qs3, quota, batch_size)
+    print('diversity batch US : %3d training rounds for %d labels'
+          % (quota // batch_size, quota))
+
+    plt.plot(n1, E1, 'g', label='sequential US')
+    plt.plot(n2, E2, 'b--o', label='top-k batch US')
+    plt.plot(n3, E3, 'r--s', label='DiversityWeightedMeta batch')
+    plt.xlabel('Number of labeled samples')
+    plt.ylabel('Test error')
+    plt.title('Sequential vs batch-mode active learning')
+    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
+               fancybox=True, shadow=True, ncol=3)
+    plt.show()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/libact/base/dataset.py b/libact/base/dataset.py
@@ -147,6 +147,57 @@ def update(self, entry_id, new_label):
         for callback in self._update_callback:
             callback(entry_id, new_label)
 
+    def update_batch(self, entry_ids, labels):
+        """Update multiple entries with their labels in a single call.
+
+        Labels are applied through :py:meth:`update` one entry at a time,
+        in the given order, so every registered callback observes exactly
+        the same incremental sequence of ``(entry_id, label)``
+        notifications as the equivalent series of individual ``update()``
+        calls. This keeps stateful observers correct (e.g. query
+        strategies that retrain models or maintain index bookkeeping in
+        their update hook).
+
+        Note that some observers impose assumptions of their own on the
+        update stream; for instance ActiveLearningByLearning assumes each
+        update corresponds to an entry it has itself queried via
+        ``make_query()``, and updating other entries is unsupported —
+        exactly as with individual ``update()`` calls.
+
+        Parameters
+        ----------
+        entry_ids : array-like of int, shape (n_updates,)
+            Distinct entry ids of the samples to update.
+
+        labels : sequence, shape (n_updates,)
+            Label for each entry. None marks an entry as unlabeled.
+
+        Raises
+        ------
+        ValueError
+            If entry_ids is not one-dimensional, labels is not a sequence,
+            their lengths differ, or entry_ids contains duplicate entries.
+        """
+        entry_ids = np.asarray(entry_ids)
+        if entry_ids.ndim != 1:
+            raise ValueError(
+                "entry_ids must be a one-dimensional array-like; for a "
+                "single entry use update(entry_id, label)")
+        try:
+            n_labels = len(labels)
+        except TypeError:  # labels has no len(), e.g. a scalar or generator
+            raise ValueError(
+                "labels must be a sequence of the same length as entry_ids")
+        if entry_ids.shape[0] != n_labels:
+            raise ValueError(
+                "entry_ids and labels must have the same length, got "
+                "%d and %d" % (entry_ids.shape[0], n_labels))
+        if len(np.unique(entry_ids)) != entry_ids.shape[0]:
+            raise ValueError("entry_ids contains duplicate entries")
+        # All argument checks above run before any entry is modified.
+        for entry_id, label in zip(entry_ids, labels):
+            self.update(entry_id, label)
+
     def on_update(self, callback):
         """
         Add callback function to call when dataset updated.

diff --git a/libact/base/interfaces.py b/libact/base/interfaces.py
@@ -2,10 +2,14 @@
 Base interfaces for use in the package.
 The package works according to the interfaces defined below.
 """
+import numbers
+
 from six import with_metaclass
 
 from abc import ABCMeta, abstractmethod
 
+import numpy as np
+
 
 class QueryStrategy(with_metaclass(ABCMeta, object)):
 
@@ -61,6 +65,88 @@ def _get_scores(self):
             "This is required for batch mode and score-based composition."
         )
 
+    @staticmethod
+    def _check_batch_size(batch_size, n_unlabeled):
+        """Validate make_query_batch arguments.
+
+        Parameters
+        ----------
+        batch_size : int
+            The requested batch size.
+
+        n_unlabeled : int
+            Number of unlabeled samples currently in the pool.
+
+        Raises
+        ------
+        TypeError
+            If batch_size is not an integer (bool is rejected).
+
+        ValueError
+            If batch_size < 1, the pool is empty, or batch_size exceeds the
+            pool size.
+        """
+        if isinstance(batch_size, bool) or \
+                not isinstance(batch_size, numbers.Integral):  # bool is an int
+            raise TypeError(
+                "batch_size must be an integer, got %r" % (batch_size,))
+        if batch_size < 1:
+            raise ValueError(
+                "batch_size must be at least 1, got %d" % batch_size)
+        if n_unlabeled == 0:  # own message for an empty pool
+            raise ValueError("No unlabeled samples available")
+        if batch_size > n_unlabeled:
+            raise ValueError(
+                "batch_size (%d) exceeds the number of unlabeled samples "
+                "(%d)" % (batch_size, n_unlabeled))
+
+    def make_query_batch(self, batch_size):
+        """Return a batch of distinct unlabeled samples to be queried.
+
+        The default implementation ranks the unlabeled pool by the
+        acquisition scores from :py:meth:`_get_scores` and returns the
+        ``batch_size`` highest-scoring entry ids. Strategies override this
+        method when a faithful batch generalization differs from top-k
+        (e.g. iterative k-center for CoreSet, sampling without replacement
+        for RandomSampling).
+
+        Unlike :py:meth:`make_query`, ties are broken deterministically
+        (stable sort, original pool order), so ``make_query_batch(1)`` may
+        differ from ``make_query()`` for strategies that randomize
+        tie-breaking.
+
+        Parameters
+        ----------
+        batch_size : int
+            Number of samples to query. Must satisfy
+            ``1 <= batch_size <= n_unlabeled``. No silent clamping is
+            performed.
+
+        Returns
+        -------
+        entry_ids : np.ndarray of int, shape (batch_size,)
+            Distinct entry ids of the samples to be queried, most preferred
+            first.
+
+        Raises
+        ------
+        TypeError
+            If batch_size is not an integer.
+
+        ValueError
+            If batch_size < 1, batch_size exceeds the number of unlabeled
+            samples, or there are no unlabeled samples.
+
+        NotImplementedError
+            If the strategy does not support per-sample scoring through
+            :py:meth:`_get_scores`.
+        """
+        entry_ids, scores = self._get_scores()  # runs before validation
+        self._check_batch_size(batch_size, len(entry_ids))
+        # Negate so the ascending stable argsort ranks the highest score first.
+        order = np.argsort(-np.asarray(scores, dtype=float), kind='stable')
+        return np.asarray(entry_ids)[order[:batch_size]]
+
     @abstractmethod
     def make_query(self):
         """Return the index of the sample to be queried and labeled. Read-only.