import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
# Set seed for reproducibility
np.random.seed(1)
# Define the number of samples per cluster
samples_per_cluster = [150, 150, 150]
# Define the means and covariance for each Gaussian distribution
means = [[2, 2], [8, 8], [2, 8]]
covariances = [[[1, 0.5], [0.5, 1]], [[1, -0.3], [-0.3, 1]], [[1, 0], [0, 1]]]
# Generate samples from each distribution
data = []
for mean, cov, n in zip(means, covariances, samples_per_cluster):
    cluster_data = np.random.multivariate_normal(mean, cov, n)
    data.append(cluster_data)
# Combine the data into a single array
data = np.vstack(data)
# Perform Kernel Density Estimation
kde = gaussian_kde(data.T, bw_method=0.5)
# Generate a grid over the data range
x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
x_grid, y_grid = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
grid_coords = np.vstack([x_grid.ravel(), y_grid.ravel()])
# Evaluate density on the grid
density = kde(grid_coords).reshape(100, 100)
# Plot the density as a 2D density plot with contours
plt.figure(figsize=(8, 6))
plt.contourf(x_grid, y_grid, density, cmap='viridis', levels=20)
plt.colorbar(label='Density')
plt.scatter(data[:, 0], data[:, 1], s=5, color='black', alpha=0.6, label='Data Points')
plt.title('2D Density Plot of Gaussian Samples')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend()
plt.show()
This example is solely for illustrating the idea of the EM algorithm. It depends heavily on the initialization, does not find an optimal solution, and is not guaranteed to converge.
This code implements a simplified, hard-assignment version of the Expectation-Maximization (EM) algorithm for clustering height measurements from a class of males and females. The goal is to identify two Gaussian clusters representing male and female heights without prior knowledge of the true cluster labels.
E-Step (Expectation Step): For each data point, the code computes the weighted likelihood under each cluster's current parameters and assigns the point to the cluster with the higher likelihood; if the likelihoods are equal, the assignment is made at random.
M-Step (Maximization Step): Based on these cluster assignments, the algorithm updates the parameters by recalculating the means, standard deviations, and mixing weights of the two clusters.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
# Set seed for reproducibility
np.random.seed(1)
# Generate height data
male_heights = np.random.normal(180, 5, 30)   # Males: Mean = 180, SD = 5
female_heights = np.random.normal(160, 3, 70) # Females: Mean = 160, SD = 3
data = np.hstack([male_heights, female_heights])
# Initial parameter guesses for EM algorithm
mean1, mean2 = np.random.choice(data, 2)
sd1, sd2 = 1, 1 # Initial standard deviations
weight1, weight2 = 0.5, 0.5 # Initial mixing weights
# Number of iterations
n_iterations = 4
# EM-style algorithm with hard cluster assignments
for iteration in range(n_iterations):
    # E-Step: Compute the weighted density of each point under each cluster
    likelihood1 = weight1 * norm.pdf(data, mean1, sd1)
    likelihood2 = weight2 * norm.pdf(data, mean2, sd2)
    # Assign each student to the cluster with the greater density
    cluster_assignments = np.where(likelihood1 > likelihood2, 1, 2)
    # If likelihoods are equal, assign randomly
    equal_likelihoods = likelihood1 == likelihood2
    random_assignments = np.random.choice([1, 2], size=equal_likelihoods.sum())
    cluster_assignments[equal_likelihoods] = random_assignments
    # M-Step: Update parameters based on current cluster assignments
    cluster1_data = data[cluster_assignments == 1]
    cluster2_data = data[cluster_assignments == 2]
    weight1 = len(cluster1_data) / len(data)
    weight2 = len(cluster2_data) / len(data)
    mean1 = np.mean(cluster1_data) if len(cluster1_data) > 0 else mean1
    mean2 = np.mean(cluster2_data) if len(cluster2_data) > 0 else mean2
    sd1 = np.std(cluster1_data, ddof=1) if len(cluster1_data) > 1 else sd1
    sd2 = np.std(cluster2_data, ddof=1) if len(cluster2_data) > 1 else sd2
    # Print current parameter estimates
    print(f"Iteration {iteration + 1}:")
    print(f" Cluster 1: Mean = {mean1:.2f}, SD = {sd1:.2f}, Weight = {weight1:.2f}")
    print(f" Cluster 2: Mean = {mean2:.2f}, SD = {sd2:.2f}, Weight = {weight2:.2f}")
Iteration 1:
 Cluster 1: Mean = 160.34, SD = 2.48, Weight = 0.70
 Cluster 2: Mean = 179.70, SD = 5.13, Weight = 0.30
Iteration 2:
 Cluster 1: Mean = 160.34, SD = 2.48, Weight = 0.70
 Cluster 2: Mean = 179.70, SD = 5.13, Weight = 0.30
Iteration 3:
 Cluster 1: Mean = 160.34, SD = 2.48, Weight = 0.70
 Cluster 2: Mean = 179.70, SD = 5.13, Weight = 0.30
Iteration 4:
 Cluster 1: Mean = 160.34, SD = 2.48, Weight = 0.70
 Cluster 2: Mean = 179.70, SD = 5.13, Weight = 0.30
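The version above uses hard cluster assignments, which is why the estimates stop changing after the first pass. For comparison, here is a minimal sketch of the standard soft-assignment EM updates for the same two-component model. It regenerates the same height data (in a notebook the existing data array could be reused); the initial spread np.std(data) is an assumption chosen to keep the densities numerically stable and is not part of the original example.
import numpy as np
from scipy.stats import norm
np.random.seed(1)
# Regenerate the same synthetic height data as above
male_heights = np.random.normal(180, 5, 30)
female_heights = np.random.normal(160, 3, 70)
data = np.hstack([male_heights, female_heights])
# Initial guesses: two random data points as means, overall SD as a (assumed) starting spread
mean1, mean2 = np.random.choice(data, 2)
sd1 = sd2 = np.std(data)
weight1 = weight2 = 0.5
for _ in range(10):
    # E-Step: responsibilities = posterior probability that each point belongs to each cluster
    l1 = weight1 * norm.pdf(data, mean1, sd1)
    l2 = weight2 * norm.pdf(data, mean2, sd2)
    r1 = l1 / (l1 + l2)
    r2 = 1.0 - r1
    # M-Step: responsibility-weighted updates instead of hard reassignment
    weight1, weight2 = r1.mean(), r2.mean()
    mean1 = np.sum(r1 * data) / np.sum(r1)
    mean2 = np.sum(r2 * data) / np.sum(r2)
    sd1 = np.sqrt(np.sum(r1 * (data - mean1) ** 2) / np.sum(r1))
    sd2 = np.sqrt(np.sum(r2 * (data - mean2) ** 2) / np.sum(r2))
print(f"Cluster 1: Mean = {mean1:.2f}, SD = {sd1:.2f}, Weight = {weight1:.2f}")
print(f"Cluster 2: Mean = {mean2:.2f}, SD = {sd2:.2f}, Weight = {weight2:.2f}")
Because every point contributes to both clusters in proportion to its responsibility, the estimates keep adjusting over the iterations instead of locking in after the first hard assignment.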
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
# Set seed for reproducibility
np.random.seed(3)
# Define the number of samples per cluster
samples_per_cluster = [150, 200, 150]
# Define the means and covariance for each Gaussian distribution
means = [[2, 2], [8, 8], [2, 8]]
covariances = [[[1, 0.5], [0.5, 1]], [[1, -0.3], [-0.3, 1]], [[1, 0], [0, 1]]]
# Generate samples from each distribution
data = []
for mean, cov, n in zip(means, covariances, samples_per_cluster):
    cluster_data = np.random.multivariate_normal(mean, cov, n)
    data.append(cluster_data)
# Combine the data into a single array
data = np.vstack(data)
# Test different numbers of components (clusters) in the GMM
component_range = range(1, 11)
log_likelihoods = []
# Fit a GMM for each number of components and record the log-likelihood
for n_components in component_range:
    gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=42)
    gmm.fit(data)
    # gmm.score returns the mean per-sample log-likelihood, so multiply by n for the total
    log_likelihoods.append(gmm.score(data) * data.shape[0])
# Plot the total log-likelihood for each number of components
plt.figure(figsize=(8, 6))
plt.plot(component_range, log_likelihoods, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Log-Likelihood')
plt.title('Log-Likelihood for Different Numbers of Components in the GMM')
plt.show()
Model Selection
In selecting the optimal number of components for a Gaussian Mixture Model (GMM), the Bayesian Information Criterion (BIC) and Akaike Information Criterion (AIC) are widely used metrics that balance model fit with complexity. Both criteria are derived from the likelihood function, but they impose different penalties for the number of parameters, helping to prevent overfitting. AIC aims to find a model that fits the data well while introducing only as many parameters as necessary, using a lighter penalty than BIC. BIC, on the other hand, applies a stronger penalty for complexity, favoring simpler models when the data size is large. Lower AIC or BIC values indicate better models, with BIC generally favoring fewer components than AIC due to its more conservative approach. By comparing these values across models with different numbers of components, we can select the GMM structure that best balances accuracy and simplicity, ensuring the model adequately captures data patterns without unnecessary complexity.
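As a concrete illustration of the penalty terms, the sketch below computes AIC = 2k - 2 log L and BIC = k log(n) - 2 log L by hand for a single fitted GMM and compares them with scikit-learn's built-in aic/bic methods. The two-blob toy data and variable names are assumptions made only for this check, and the parameter count k assumes one full covariance matrix per component.
import numpy as np
from sklearn.mixture import GaussianMixture
rng = np.random.default_rng(0)
# Assumed toy data: two well-separated 2D blobs
X = np.vstack([rng.normal([0, 0], 1, size=(200, 2)),
               rng.normal([6, 6], 1, size=(200, 2))])
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0).fit(X)
n, d = X.shape
K = gmm.n_components
# Free parameters of a full-covariance GMM:
# K*d means + K*d*(d+1)/2 covariance entries + (K-1) mixing weights
k_params = K * d + K * d * (d + 1) // 2 + (K - 1)
# gmm.score returns the mean per-sample log-likelihood
total_log_likelihood = gmm.score(X) * n
aic_manual = 2 * k_params - 2 * total_log_likelihood
bic_manual = k_params * np.log(n) - 2 * total_log_likelihood
print(f"Manual AIC: {aic_manual:.1f}  sklearn AIC: {gmm.aic(X):.1f}")
print(f"Manual BIC: {bic_manual:.1f}  sklearn BIC: {gmm.bic(X):.1f}")
The only difference between the two criteria is the multiplier on k: a constant 2 for AIC versus log(n) for BIC, which is why BIC penalizes extra components more heavily as the sample size grows.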
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
# Generate data
means = [[0, 0], [10, 10], [0, 10]]
covariances = [[[1, 0.2], [0.2, 1]], [[1, -0.2], [-0.2, 1]], [[1, 0], [0, 1]]]
samples_per_cluster = [150, 200, 150]
data = np.vstack([
np.random.multivariate_normal(mean, cov, n)
for mean, cov, n in zip(means, covariances, samples_per_cluster)
])
# Model selection with BIC, AIC, and Silhouette Score
max_clusters = 5
n_init = 10
bic_scores = []
aic_scores = []
silhouette_scores = []
for n_clusters in range(2, max_clusters + 1):
    best_bic_for_n = np.inf
    best_aic_for_n = np.inf
    best_silhouette_for_n = -1
    for _ in range(n_init):
        gmm = GaussianMixture(n_components=n_clusters, covariance_type='full', random_state=np.random.randint(100))
        gmm.fit(data)
        labels = gmm.predict(data)
        bic = gmm.bic(data)
        aic = gmm.aic(data)
        # Track the best BIC, AIC, and silhouette score across initializations
        if bic < best_bic_for_n:
            best_bic_for_n = bic
        if aic < best_aic_for_n:
            best_aic_for_n = aic
        # Silhouette score requires at least 2 clusters, which always holds since the range starts at 2
        silhouette_avg = silhouette_score(data, labels)
        if silhouette_avg > best_silhouette_for_n:
            best_silhouette_for_n = silhouette_avg
    bic_scores.append((n_clusters, best_bic_for_n))
    aic_scores.append((n_clusters, best_aic_for_n))
    silhouette_scores.append((n_clusters, best_silhouette_for_n))
# Plot BIC, AIC, and Silhouette Scores
fig, ax = plt.subplots(1, 3, figsize=(18, 5))
ax[0].plot([x[0] for x in bic_scores], [x[1] for x in bic_scores], marker='o', label="BIC")
ax[0].set_title("BIC Scores")
ax[0].set_xlabel("Number of Clusters")
ax[0].set_ylabel("BIC")
ax[1].plot([x[0] for x in aic_scores], [x[1] for x in aic_scores], marker='o', label="AIC", color='orange')
ax[1].set_title("AIC Scores")
ax[1].set_xlabel("Number of Clusters")
ax[1].set_ylabel("AIC")
ax[2].plot([x[0] for x in silhouette_scores], [x[1] for x in silhouette_scores], marker='o', label="Silhouette Score", color='green')
ax[2].set_title("Silhouette Scores")
ax[2].set_xlabel("Number of Clusters")
ax[2].set_ylabel("Silhouette Score")
plt.show()
LDA
This code demonstrates how to perform topic modeling on a text dataset using Latent Dirichlet Allocation (LDA) with scikit-learn. First, it loads a subset of the 20 Newsgroups dataset, focusing on three categories, to serve as an example for topic discovery. The documents are then converted into a term-frequency matrix using CountVectorizer, with settings to exclude overly common and rare words, as well as English stop words. An LDA model with three topics is fit to this term-frequency data, which represents each document as a mixture of topics. After training, the code extracts and displays the top 10 words for each topic, providing insight into the main themes of each discovered topic. Finally, the code computes the topic distribution for each document, displaying the topic mix for the first five documents, thus illustrating how each document’s content is represented as a blend of different topics. This approach highlights the LDA model’s ability to capture the latent structure and thematic diversity within a collection of text documents.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd
# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', categories=['sci.space', 'rec.sport.baseball', 'talk.politics.mideast'])
documents = newsgroups.data
# Convert text data into a term-frequency matrix
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(documents)
# Fit the LDA model
n_topics = 3 # Number of topics we want to discover
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X)
# Get the feature names (words)
words = vectorizer.get_feature_names_out()
# Display the top words for each topic
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}:")
        # argsort is ascending, so the last num_top_words indices are the highest-weight words
        print(" ".join([feature_names[i] for i in topic.argsort()[-num_top_words:]]))
        print("\n")
# Show top 10 words for each topic
display_topics(lda, words, 10)
# Transform the documents to get topic distribution for each document
topic_distribution = lda.transform(X)
# Display the first 5 documents' topic distributions
print("Topic distribution for the first 5 documents:")
print(pd.DataFrame(np.round(topic_distribution[:5], 2), columns=[f"Topic {i+1}" for i in range(n_topics)]))
print("++++++++++++++++++++++++++++++++++++++++++")
# Access the topic-word distribution
topic_word_distribution = lda.components_
# Find the most likely topic for each word (argmax gives 0-based indices; index 0 corresponds to "Topic 1" above)
most_likely_topic_for_word = topic_word_distribution.argmax(axis=0)
# Create a DataFrame to display each word and its most likely topic
word_topic_df = pd.DataFrame({
'Word': words,
'Most_Likely_Topic': most_likely_topic_for_word
})
# Display the first 20 words in the vocabulary (alphabetical order) with their most likely topic
print(word_topic_df.head(20))
Topic 1:
just israeli jews people com article organization writes israel edu

Topic 2:
host don posting game year article writes com organization edu

Topic 3:
time turks armenia nasa said people armenians turkish armenian space

Topic distribution for the first 5 documents:
   Topic 1  Topic 2  Topic 3
0     0.00     1.00     0.00
1     1.00     0.00     0.00
2     0.36     0.05     0.58
3     0.00     0.69     0.31
4     0.00     0.99     0.00
++++++++++++++++++++++++++++++++++++++++++
          Word  Most_Likely_Topic
0           00                  2
1          000                  2
2         0000                  2
3        00000                  2
4       000021                  0
5       000050                  0
6     00041032                  2
7       000413                  0
8   0004244402                  2
9      0004422                  2
10        0005                  1
11       000th                  1
12         001                  1
13      001555                  0
14      001718                  0
15      002118                  0
16      002214                  2
17     002251w                  1
18        0029                  2
19      003015                  1
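As a quick follow-up, the sketch below shows how the fitted model can assign a topic mixture to an unseen piece of text, assuming the vectorizer and lda objects from the cells above are still in memory; the example sentence is made up for illustration.
# Hypothetical new document (not from the corpus), used only to illustrate inference on unseen text
new_doc = ["NASA plans another shuttle launch to study the orbit of a new satellite"]
# Reuse the fitted CountVectorizer and LDA model from above
new_X = vectorizer.transform(new_doc)
new_topic_distribution = lda.transform(new_X)
for i, weight in enumerate(new_topic_distribution[0]):
    print(f"Topic {i + 1}: {weight:.2f}")
Words outside the training vocabulary are simply ignored by the vectorizer, so the resulting mixture reflects only the terms the model has already seen.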