change: updated clustering/MoVM; add: docstring to clustering/MoVM, a…

…dd clustering to docs
circstat · Feb 7, 2025 · 3a8c5ca · 3a8c5ca
1 parent 99f4a5c
commit 3a8c5ca
Show file tree

Hide file tree

Showing 6 changed files with 232 additions and 40 deletions.
diff --git a/docs/docs/feature-checklist.md b/docs/docs/feature-checklist.md
@@ -30,18 +30,18 @@
 
 #### One-Sample Tests for Significance
 
-| Feature                     | H0                        | PyCircStat2         | PyCircStat | CircStat (MATLAB) | CircStats (R) | circular (R)    |
-| --------------------------- | ------------------------- | ------------------- | ---------- | ----------------- | ------------- | --------------- |
-| **Mean Direction**          |                           |                     |            |                   |               |                 |
-| Rayleigh Test               | $\rho=0$                  | `rayleigh_test`     | `rayleigh` | `circ_rtest`      | `r.test`      | `rayleigh.test` |
-| V-Test                      | $\rho=0$                  | `V_test`            | `vtest`    | `circ_vtest`      | `v0.test`     | -               |
-| One-sample Test             | $\tilde\mu=μ_0$           | `one_sample_test`   | `mtest`    | `circ_mtest`      | -             | -               |
-| Change Point Test           | no change point           | `change_point_test` | -          | -                 | `change.pt`   | `change.point`  |
-| **Median Direction**        |                           |                     |            |                   |               |                 |
-| Hodges-Ajne (omnibus) Test  | $\rho=0$                  | `omnibus_test`      | `omnibus`  | `circ_otest`      | -             | -               |
-| Batschelet Test             | $\rho=0$                  | `batschelet_test`   | -          | -                 | -             | -               |
-| Binomial Test               | $\tilde\theta = \theta_0$ | `binomial_test`     | `medtest`  | `circ_medtest`    | -             | -               |
-| Symmetry Test around median | $\text{symmetry}$         | `symmetry_test`     | `symtest`  | `circ_symtest`    | -             | -               |
+| Feature                     | H0                                  | PyCircStat2         | PyCircStat | CircStat (MATLAB) | CircStats (R) | circular (R)    |
+| --------------------------- | ----------------------------------- | ------------------- | ---------- | ----------------- | ------------- | --------------- |
+| **Mean Direction**          |                                     |                     |            |                   |               |                 |
+| Rayleigh Test               | $\rho=0$ [^uniform]                 | `rayleigh_test`     | `rayleigh` | `circ_rtest`      | `r.test`      | `rayleigh.test` |
+| V-Test                      | $\rho=0$                            | `V_test`            | `vtest`    | `circ_vtest`      | `v0.test`     | -               |
+| One-sample Test             | $\tilde\mu=μ_0$                     | `one_sample_test`   | `mtest`    | `circ_mtest`      | -             | -               |
+| Change Point Test           | no change point                     | `change_point_test` | -          | -                 | `change.pt`   | `change.point`  |
+| **Median Direction**        |                                     |                     |            |                   |               |                 |
+| Hodges-Ajne (omnibus) Test  | $\rho=0$                            | `omnibus_test`      | `omnibus`  | `circ_otest`      | -             | -               |
+| Batschelet Test             | $\rho=0$                            | `batschelet_test`   | -          | -                 | -             | -               |
+| Binomial Test               | $\tilde\theta = \theta_0$ [^median] | `binomial_test`     | `medtest`  | `circ_medtest`    | -             | -               |
+| Symmetry Test around median | $\text{symmetry}$                   | `symmetry_test`     | `symtest`  | `circ_symtest`    | -             | -               |
 
 #### Multi-Sample Tests for Significance
 
@@ -57,7 +57,7 @@
 | Concentration Test (F-test)     | $\kappa_1 = \dots = \kappa_n$                 | `concentration_test`   | -                 | `circ_ktest`      | -                 | -                      |
 | Equal Kappa Test                | $\kappa_1 = \dots = \kappa_n$                 | `equal_kappa_test`     | -                 | -                 | -                 | `equal.kappa.test`     |
 | **Distribution Homogeneity**    |                                               |                        |                   |                   |                   |                        |
-| Watson's U2 Test                | $F_1 = F_2$                                   | `watson_u2_test`       | -                 | -                 | `watson.two`      | `watson.two.test`      |
+| Watson's U2 Test                | $F_1 = F_2$ [^F]                              | `watson_u2_test`       | -                 | -                 | `watson.two`      | `watson.two.test`      |
 | Wallraff Test                   | $F_1 = F_2$                                   | `wallraff_test`        | -                 | -                 | -                 | `wallraff.test`        |
 | Wheeler-Watson Test             | $F_1 = F_2$                                   | `wheeler_watson_test`  | -                 | -                 | -                 | `watson.wheeler.test`  |
 | Rao's Tests for Homogeneity     | $F_1 = F_2$                                   | `rao_homogeneity_test` | -                 | -                 | `rao.homogeneity` | `rao.test`             |
@@ -75,8 +75,8 @@
 ### 3. Correlation & Regression
 | Feature                       | PyCircStat2    | PyCircStat | CircStat (MATLAB) | CircStats (R) | circular (R)              |
 | ----------------------------- | -------------- | ---------- | ----------------- | ------------- | ------------------------- |
-| Circular-Circular Correlation | `circ_corrcc`       | `corrcc`   | `circ_corrcc`     | `circ.cor`    | `cor.circular`            |
-| Circular-Linear Correlation   | `circ_corrcl`       | `corrcl`   | `circ_corrcl`     | -             | -                         |
+| Circular-Circular Correlation | `circ_corrcc`  | `corrcc`   | `circ_corrcc`     | `circ.cor`    | `cor.circular`            |
+| Circular-Linear Correlation   | `circ_corrcl`  | `corrcl`   | `circ_corrcl`     | -             | -                         |
 | Circular-Circular Regression  | `CCRegression` | -          | -                 | `circ.reg`    | `lm.circular(type="c-c")` |
 | Circular-Linear Regression    | `CLRegression` | -          | -                 | -             | `lm.circular(type="c-l")` |
 
@@ -168,7 +168,7 @@
 
 [^uniform]: $\rho=0$ stands for uniform distributed.
 [^median]: $\theta$ stands for median.
+[^F]: $F$ stands for distributions.
 [^one-way]: Yet anothr one-way ANOVA.
 [^two-way]: Two-way ANOVA.
-[^F]: $F$ stands for distributions.
 
diff --git a/docs/docs/reference/clustering.md b/docs/docs/reference/clustering.md
@@ -0,0 +1 @@
+::: pycircstat2.clustering
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
@@ -29,6 +29,7 @@ nav:
           - Correlation: reference/correlation.md
           - Regression: reference/regression.md
           - Distributions: reference/distributions.md
+          - Clustering: reference/clustering.md
           - Utilities: reference/utils.md
 
 markdown_extensions:

diff --git a/pycircstat2/clustering.py b/pycircstat2/clustering.py
@@ -1,15 +1,55 @@
-from typing import Union
+from typing import Optional, Union
 
 import numpy as np
-from scipy.stats import vonmises
 
 from .descriptive import circ_kappa, circ_mean_and_r
+from .distributions import vonmises
 from .utils import data2rad
 
 
 class MoVM:
     """
-    Mixture of von Mises Clustering
+    Mixture of von Mises (MoVM) Clustering.
+
+    This class implements the Expectation-Maximization (EM) algorithm for clustering 
+    circular data using a mixture of von Mises distributions. It is analogous to 
+    Gaussian Mixture Models (GMM) but adapted for directional statistics.
+
+    Parameters
+    ----------
+    burnin : int, default=30
+        Number of initial iterations before checking for convergence.
+    n_clusters : int, default=5
+        The number of von Mises distributions (clusters) to fit.
+    n_iters : int, default=100
+        Maximum number of iterations for the EM algorithm.
+    n_intervals : int, default=360
+        Used for converting degree-based data into radians.
+    unit : {"degree", "radian"}, default="degree"
+        Specifies whether input data is in degrees or radians.
+    random_seed : int, default=2046
+        Random seed for reproducibility.
+    threshold : float, default=1e-16
+        Convergence threshold based on the negative log-likelihood difference.
+
+    Attributes
+    ----------
+    converged : bool
+        Whether the algorithm has converged.
+    nLL : np.ndarray
+        Array of negative log-likelihood values over iterations.
+    m : np.ndarray
+        Cluster means (circular means).
+    r : np.ndarray
+        Cluster mean resultant vectors.
+    p : np.ndarray
+        Cluster probabilities.
+    kappa : np.ndarray
+        Concentration parameters for each von Mises component.
+    gamma : np.ndarray
+        Responsibility matrix (posterior probabilities of clusters for each data point).
+    labels : np.ndarray
+        The most probable cluster assignment for each data point.
     """
 
     def __init__(
@@ -37,25 +77,62 @@ def _initialize(
         self,
         x: np.ndarray,
         n_clusters_init: int,
-    ):
-        n = len(x)  # number of samples
-        p = (
-            np.ones(n_clusters_init) / n_clusters_init
-        )  # initial cluster probability
-        z = np.random.choice(
-            np.arange(n_clusters_init), size=n
-        )  # initial labels
+    ) -> tuple:
+        """
+        Initializes cluster parameters before running the EM algorithm.
+
+        Parameters
+        ----------
+        x : np.ndarray
+            Input circular data in radians.
+        n_clusters_init : int
+            Number of initial clusters.
+
+        Returns
+        -------
+        tuple
+            - m (np.ndarray): Initial cluster means.
+            - kappa (np.ndarray): Initial concentration parameters.
+            - p (np.ndarray): Initial cluster probabilities.
+        """
+        # number of samples
+        n = len(x)  
+
+        # initial cluster probability
+        p = np.ones(n_clusters_init) / n_clusters_init 
+
+        # initial labels
+        z = np.random.choice(np.arange(n_clusters_init), size=n) 
+
+        # initial means and resultant vector lengths
         m, r = map(
             np.array,
             zip(*[circ_mean_and_r(x[z == i]) for i in range(n_clusters_init)]),
-        )  # initial means and resultant vector lengths
-        kappa = np.array(
-            [circ_kappa(r=r[i]) for i in range(n_clusters_init)]
-        )  # initial kappa (without correction by hard-coding a larger enough n)
+        )  
+
+        # initial kappa (without correction by hard-coding a larger enough n)
+        kappa = np.array([circ_kappa(r=r[i]) for i in range(n_clusters_init)])  
 
         return m, kappa, p
 
     def fit(self, x: np.ndarray, verbose: Union[bool, int] = 0):
+        """
+        Fits the mixture of von Mises model to the given data using the EM algorithm.
+
+        Parameters
+        ----------
+        x : np.ndarray
+            Input data points in degrees or radians.
+        verbose : bool or int, default=0
+            If True, prints progress every iteration. If an integer, prints every `verbose` iterations.
+
+        Updates
+        -------
+        - self.m : Fitted cluster means.
+        - self.kappa : Fitted concentration parameters.
+        - self.p : Fitted cluster probabilities.
+        - self.labels : Final cluster assignments.
+        """
         # seed
         np.random.seed(self.random_seed)
 
@@ -83,6 +160,7 @@ def fit(self, x: np.ndarray, verbose: Union[bool, int] = 0):
                 np.sum(gamma_normed, axis=1)
                 / np.sum(gamma_normed, axis=1).sum()
             )
+
             m, r = map(
                 np.array,
                 zip(
@@ -136,20 +214,49 @@ def compute_gamma(
         p: np.ndarray,
         m: np.ndarray,
         kappa: np.ndarray,
-    ):
+    )-> np.ndarray:
+        """
+        Computes posterior probabilities (responsibilities) for each cluster.
+
+        Returns
+        -------
+        np.ndarray
+            Cluster assignment probabilities for each data point.
+        """
         gamma = np.vstack(
             [
-                p[i] * vonmises.pdf(x_rad, kappa=kappa[i], loc=m[i])
+                p[i] * vonmises.pdf(x_rad, kappa=kappa[i], mu=m[i])
                 for i in range(self.n_clusters)
             ]
         )
         return gamma
 
-    def compute_nLL(self, gamma: np.ndarray):
+    def compute_nLL(self, gamma: np.ndarray)-> float:
+        """
+        Computes the negative log-likelihood.
+
+        Parameters
+        ----------
+        gamma : np.ndarray
+            The responsibility matrix (posterior probabilities of clusters for each data point).
+
+        Returns
+        -------
+        float
+            The negative log-likelihood value.
+        """
         nLL = -np.sum(np.log(np.sum(gamma, axis=0) + 1e-16))
         return nLL
 
-    def compute_BIC(self):
+    def compute_BIC(self)-> float:
+        """
+        Computes the Bayesian Information Criterion (BIC) for model selection.
+
+        Returns
+        -------
+        float
+            The computed BIC value.
+        """
         nLL = self.compute_nLL(self.gamma)
         nparams = self.n_clusters * 3 - 1  # n_means + n_kappas + (n_ps - 1)
         bic = 2 * nLL + np.log(self.n) * nparams
@@ -158,10 +265,27 @@ def compute_BIC(self):
 
     def predict_density(
         self,
-        x: np.ndarray = None,
+        x: Optional[np.ndarray] = None,
         unit: Union[str, None] = None,
         n_intervals: Union[float, int, None] = None,
-    ):
+    )-> np.ndarray:
+        """
+        Predicts density estimates for given points.
+
+        Parameters
+        ----------
+        x : np.ndarray, optional
+            Points at which to estimate the density.
+        unit : {"degree", "radian"}, optional
+            Specifies whether input data is in degrees or radians.
+        n_intervals : int, optional
+            Number of intervals for data conversion.
+
+        Returns
+        -------
+        np.ndarray
+            Estimated density at the provided points.
+        """
         unit = self.unit if unit is None else unit
         n_intervals = self.n_intervals if n_intervals is None else n_intervals
 
@@ -171,12 +295,25 @@ def predict_density(
         x_rad = x if unit == "radian" else data2rad(x, n_intervals)
 
         d = [
-            self.p[i] * vonmises.pdf(x_rad, kappa=self.kappa[i], loc=self.m[i])
+            self.p[i] * vonmises.pdf(x_rad, kappa=self.kappa[i], mu=self.m[i])
             for i in range(self.n_clusters)
         ]
         return np.sum(d, axis=0)
 
-    def predict(self, x: np.ndarray):
+    def predict(self, x: np.ndarray)-> np.ndarray:
+        """
+        Predicts cluster assignments for new data.
+
+        Parameters
+        ----------
+        x : np.ndarray
+            New data points in degrees or radians.
+
+        Returns
+        -------
+        np.ndarray
+            Predicted cluster labels.
+        """
         x_rad = x if self.unit == "radian" else data2rad(x, self.n_intervals)
 
         gamma = self.compute_gamma(

diff --git a/pycircstat2/visualization.py b/pycircstat2/visualization.py
@@ -105,8 +105,8 @@ def circ_plot(
 
         # plot density
         if plot_density:  # and not np.isclose(circ_data.r, 0):
-
             kwargs_density = kwargs.pop("kwargs_density", {})
+
             density_method = kwargs_density.pop("method", "nonparametric")
             density_color = kwargs_density.pop("color", "black")
             density_linestyle = kwargs_density.pop("linestyle", "-")

diff --git a/tests/test_clustering.py b/tests/test_clustering.py
@@ -0,0 +1,53 @@
+import numpy as np
+import pytest
+
+from pycircstat2.clustering import MoVM
+
+
+@pytest.fixture
+def sample_data():
+    """Generate sample circular data following a mixture of von Mises distributions."""
+    np.random.seed(42)
+    x1 = np.random.vonmises(mu=0, kappa=5, size=100)
+    x2 = np.random.vonmises(mu=np.pi, kappa=10, size=100)
+    x = np.concatenate([x1, x2])
+    np.random.shuffle(x)
+    return x
+
+@pytest.fixture
+def movm_instance():
+    """Create a default instance of MoVM for testing."""
+    return MoVM(n_clusters=3, n_iters=50, unit="radian", random_seed=42)
+
+def test_initialization(movm_instance):
+    """Test if the MoVM class initializes with correct parameters."""
+    assert movm_instance.n_clusters == 3
+    assert movm_instance.n_iters == 50
+    assert movm_instance.unit == "radian"
+
+def test_fit_convergence(movm_instance, sample_data):
+    """Test if the algorithm converges within the given iterations."""
+    movm_instance.fit(sample_data, verbose=False)
+    assert movm_instance.converged or len(movm_instance.nLL) == movm_instance.n_iters
+
+def test_fit_cluster_assignment(movm_instance, sample_data):
+    """Ensure that fitted cluster assignments are valid and nontrivial."""
+    movm_instance.fit(sample_data, verbose=False)
+    unique_labels = np.unique(movm_instance.labels)
+    assert len(unique_labels) <= movm_instance.n_clusters  # Some clusters may be empty
+    assert len(unique_labels) > 1  # Should not collapse into a single cluster
+
+def test_predict(movm_instance, sample_data):
+    """Test cluster predictions on input data."""
+    movm_instance.fit(sample_data, verbose=False)
+    predicted_labels = movm_instance.predict(sample_data)
+    assert len(predicted_labels) == len(sample_data)
+    assert predicted_labels.dtype == np.int64
+
+def test_predict_density(movm_instance):
+    """Ensure density prediction returns reasonable values."""
+    movm_instance.fit(np.random.vonmises(mu=0, kappa=5, size=200), verbose=False)
+    x_test = np.linspace(0, 2 * np.pi, 50)
+    density = movm_instance.predict_density(x_test)
+    assert len(density) == len(x_test)
+    assert np.all(density >= 0)  # Probabilities should not be negative