import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
# Set seed for reproducibility
np.random.seed(1)
# Define the number of samples per cluster
samples_per_cluster = [150, 150, 150]
# Define the means and covariance for each Gaussian distribution
means = [[2, 2], [8, 8], [2, 8]]
covariances = [[[1, 0.5], [0.5, 1]], [[1, -0.3], [-0.3, 1]], [[1, 0], [0, 1]]]
# Generate samples from each distribution
data = []
for mean, cov, n in zip(means, covariances, samples_per_cluster):
    cluster_data = np.random.multivariate_normal(mean, cov, n)
    data.append(cluster_data)
# Combine the data into a single array
data = np.vstack(data)
# Perform Kernel Density Estimation
kde = gaussian_kde(data.T, bw_method=0.5)
# Generate a grid over the data range
x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
x_grid, y_grid = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
grid_coords = np.vstack([x_grid.ravel(), y_grid.ravel()])
# Evaluate density on the grid
density = kde(grid_coords).reshape(100, 100)
# Plot the density as a 2D density plot with contours
plt.figure(figsize=(8, 6))
plt.contourf(x_grid, y_grid, density, cmap='viridis', levels=20)
plt.colorbar(label='Density')
plt.scatter(data[:, 0], data[:, 1], s=5, color='black', alpha=0.6, label='Data Points')
plt.title('2D Density Plot of Gaussian Samples')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend()
plt.show()
This example is solely for illustrating the idea of the EM algorithm. It depends heavily on the initialization, does not find an optimal solution, and is not guaranteed to converge.
This code implements a simplified, hard-assignment version of the Expectation-Maximization (EM) algorithm for clustering height measurements from a class of males and females. The goal is to identify two Gaussian clusters representing male and female heights without prior knowledge of the true cluster labels.
E-Step (Expectation Step): For each data point, the code computes the weighted likelihood under each cluster's current parameters and assigns the point to the cluster with the higher likelihood; if the likelihoods are equal, the assignment is made at random.
M-Step (Maximization Step): Based on these cluster assignments, the algorithm updates the parameters by recalculating the means, standard deviations, and mixing weights of the two clusters.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
# Set seed for reproducibility
np.random.seed(1)
# Generate height data
male_heights = np.random.normal(180, 5, 30)   # Males: Mean = 180, SD = 5
female_heights = np.random.normal(160, 3, 70) # Females: Mean = 160, SD = 3
data = np.hstack([male_heights, female_heights])
# Initial parameter guesses for EM algorithm
mean1, mean2 = np.random.choice(data, 2)
sd1, sd2 = 1, 1 # Initial standard deviations
weight1, weight2 = 0.5, 0.5 # Initial mixing weights
# Number of iterations
n_iterations = 4
# EM-style algorithm with hard cluster assignments
for iteration in range(n_iterations):
    # E-Step: Compute the weighted density of each point under each cluster
    likelihood1 = weight1 * norm.pdf(data, mean1, sd1)
    likelihood2 = weight2 * norm.pdf(data, mean2, sd2)
    # Assign each student to the cluster with the greater density
    cluster_assignments = np.where(likelihood1 > likelihood2, 1, 2)
    # If likelihoods are equal, assign randomly
    equal_likelihoods = likelihood1 == likelihood2
    random_assignments = np.random.choice([1, 2], size=equal_likelihoods.sum())
    cluster_assignments[equal_likelihoods] = random_assignments
    # M-Step: Update parameters based on current cluster assignments
    cluster1_data = data[cluster_assignments == 1]
    cluster2_data = data[cluster_assignments == 2]
    weight1 = len(cluster1_data) / len(data)
    weight2 = len(cluster2_data) / len(data)
    mean1 = np.mean(cluster1_data) if len(cluster1_data) > 0 else mean1
    mean2 = np.mean(cluster2_data) if len(cluster2_data) > 0 else mean2
    sd1 = np.std(cluster1_data, ddof=1) if len(cluster1_data) > 1 else sd1
    sd2 = np.std(cluster2_data, ddof=1) if len(cluster2_data) > 1 else sd2
    # Print current parameter estimates
    print(f"Iteration {iteration + 1}:")
    print(f" Cluster 1: Mean = {mean1:.2f}, SD = {sd1:.2f}, Weight = {weight1:.2f}")
    print(f" Cluster 2: Mean = {mean2:.2f}, SD = {sd2:.2f}, Weight = {weight2:.2f}")
Iteration 1:
 Cluster 1: Mean = 160.34, SD = 2.48, Weight = 0.70
 Cluster 2: Mean = 179.70, SD = 5.13, Weight = 0.30
Iteration 2:
 Cluster 1: Mean = 160.34, SD = 2.48, Weight = 0.70
 Cluster 2: Mean = 179.70, SD = 5.13, Weight = 0.30
Iteration 3:
 Cluster 1: Mean = 160.34, SD = 2.48, Weight = 0.70
 Cluster 2: Mean = 179.70, SD = 5.13, Weight = 0.30
Iteration 4:
 Cluster 1: Mean = 160.34, SD = 2.48, Weight = 0.70
 Cluster 2: Mean = 179.70, SD = 5.13, Weight = 0.30
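The version above uses hard cluster assignments, which is why the estimates stop changing after the first pass. For comparison, here is a minimal sketch of the standard soft-assignment EM updates for the same two-component model. It regenerates the same height data (in a notebook the existing data array could be reused); the initial spread np.std(data) is an assumption chosen to keep the densities numerically stable and is not part of the original example.
import numpy as np
from scipy.stats import norm
np.random.seed(1)
# Regenerate the same synthetic height data as above
male_heights = np.random.normal(180, 5, 30)
female_heights = np.random.normal(160, 3, 70)
data = np.hstack([male_heights, female_heights])
# Initial guesses: two random data points as means, overall SD as a (assumed) starting spread
mean1, mean2 = np.random.choice(data, 2)
sd1 = sd2 = np.std(data)
weight1 = weight2 = 0.5
for _ in range(10):
    # E-Step: responsibilities = posterior probability that each point belongs to each cluster
    l1 = weight1 * norm.pdf(data, mean1, sd1)
    l2 = weight2 * norm.pdf(data, mean2, sd2)
    r1 = l1 / (l1 + l2)
    r2 = 1.0 - r1
    # M-Step: responsibility-weighted updates instead of hard reassignment
    weight1, weight2 = r1.mean(), r2.mean()
    mean1 = np.sum(r1 * data) / np.sum(r1)
    mean2 = np.sum(r2 * data) / np.sum(r2)
    sd1 = np.sqrt(np.sum(r1 * (data - mean1) ** 2) / np.sum(r1))
    sd2 = np.sqrt(np.sum(r2 * (data - mean2) ** 2) / np.sum(r2))
print(f"Cluster 1: Mean = {mean1:.2f}, SD = {sd1:.2f}, Weight = {weight1:.2f}")
print(f"Cluster 2: Mean = {mean2:.2f}, SD = {sd2:.2f}, Weight = {weight2:.2f}")
Because every point contributes to both clusters in proportion to its responsibility, the estimates keep adjusting over the iterations instead of locking in after the first hard assignment.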
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
# Set seed for reproducibility
np.random.seed(3)
# Define the number of samples per cluster
samples_per_cluster = [150, 200, 150]
# Define the means and covariance for each Gaussian distribution
means = [[2, 2], [8, 8], [2, 8]]
covariances = [[[1, 0.5], [0.5, 1]], [[1, -0.3], [-0.3, 1]], [[1, 0], [0, 1]]]
# Generate samples from each distribution
data = []
for mean, cov, n in zip(means, covariances, samples_per_cluster):
    cluster_data = np.random.multivariate_normal(mean, cov, n)
    data.append(cluster_data)
# Combine the data into a single array
data = np.vstack(data)
# Test different numbers of components (clusters) in the GMM
component_range = range(1, 11)
log_likelihoods = []
# Fit a GMM for each number of components and record the log-likelihood
for n_components in component_range:
    gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=42)
    gmm.fit(data)
    # gmm.score returns the mean per-sample log-likelihood, so multiply by n for the total
    log_likelihoods.append(gmm.score(data) * data.shape[0])
# Plot the total log-likelihood for each number of components
plt.figure(figsize=(8, 6))
plt.plot(component_range, log_likelihoods, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Log-Likelihood')
plt.title('Log-Likelihood for Different Numbers of Components in the GMM')
plt.show()
Model Selection
In selecting the optimal number of components for a Gaussian Mixture Model (GMM), the Bayesian Information Criterion (BIC) and Akaike Information Criterion (AIC) are widely used metrics that balance model fit with complexity. Both criteria are derived from the likelihood function, but they impose different penalties for the number of parameters, helping to prevent overfitting. AIC aims to find a model that fits the data well while introducing only as many parameters as necessary, using a lighter penalty than BIC. BIC, on the other hand, applies a stronger penalty for complexity, favoring simpler models when the data size is large. Lower AIC or BIC values indicate better models, with BIC generally favoring fewer components than AIC due to its more conservative approach. By comparing these values across models with different numbers of components, we can select the GMM structure that best balances accuracy and simplicity, ensuring the model adequately captures data patterns without unnecessary complexity.
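As a concrete illustration of the penalty terms, the sketch below computes AIC = 2k - 2 log L and BIC = k log(n) - 2 log L by hand for a single fitted GMM and compares them with scikit-learn's built-in aic/bic methods. The two-blob toy data and variable names are assumptions made only for this check, and the parameter count k assumes one full covariance matrix per component.
import numpy as np
from sklearn.mixture import GaussianMixture
rng = np.random.default_rng(0)
# Assumed toy data: two well-separated 2D blobs
X = np.vstack([rng.normal([0, 0], 1, size=(200, 2)),
               rng.normal([6, 6], 1, size=(200, 2))])
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0).fit(X)
n, d = X.shape
K = gmm.n_components
# Free parameters of a full-covariance GMM:
# K*d means + K*d*(d+1)/2 covariance entries + (K-1) mixing weights
k_params = K * d + K * d * (d + 1) // 2 + (K - 1)
# gmm.score returns the mean per-sample log-likelihood
total_log_likelihood = gmm.score(X) * n
aic_manual = 2 * k_params - 2 * total_log_likelihood
bic_manual = k_params * np.log(n) - 2 * total_log_likelihood
print(f"Manual AIC: {aic_manual:.1f}  sklearn AIC: {gmm.aic(X):.1f}")
print(f"Manual BIC: {bic_manual:.1f}  sklearn BIC: {gmm.bic(X):.1f}")
The only difference between the two criteria is the multiplier on k: a constant 2 for AIC versus log(n) for BIC, which is why BIC penalizes extra components more heavily as the sample size grows.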
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
# Generate data
means = [[0, 0], [10, 10], [0, 10]]
covariances = [[[1, 0.2], [0.2, 1]], [[1, -0.2], [-0.2, 1]], [[1, 0], [0, 1]]]
samples_per_cluster = [150, 200, 150]
data = np.vstack([
np.random.multivariate_normal(mean, cov, n)
for mean, cov, n in zip(means, covariances, samples_per_cluster)
])
# Model selection with BIC, AIC, and Silhouette Score
max_clusters = 5
n_init = 10
bic_scores = []
aic_scores = []
silhouette_scores = []
for n_clusters in range(2, max_clusters + 1):
    best_bic_for_n = np.inf
    best_aic_for_n = np.inf
    best_silhouette_for_n = -1
    for _ in range(n_init):
        gmm = GaussianMixture(n_components=n_clusters, covariance_type='full', random_state=np.random.randint(100))
        gmm.fit(data)
        labels = gmm.predict(data)
        bic = gmm.bic(data)
        aic = gmm.aic(data)
        # Track the best BIC, AIC, and silhouette score across initializations
        if bic < best_bic_for_n:
            best_bic_for_n = bic
        if aic < best_aic_for_n:
            best_aic_for_n = aic
        # Silhouette score requires at least 2 clusters, which always holds since the range starts at 2
        silhouette_avg = silhouette_score(data, labels)
        if silhouette_avg > best_silhouette_for_n:
            best_silhouette_for_n = silhouette_avg
    bic_scores.append((n_clusters, best_bic_for_n))
    aic_scores.append((n_clusters, best_aic_for_n))
    silhouette_scores.append((n_clusters, best_silhouette_for_n))
# Plot BIC, AIC, and Silhouette Scores
fig, ax = plt.subplots(1, 3, figsize=(18, 5))
ax[0].plot([x[0] for x in bic_scores], [x[1] for x in bic_scores], marker='o', label="BIC")
ax[0].set_title("BIC Scores")
ax[0].set_xlabel("Number of Clusters")
ax[0].set_ylabel("BIC")
ax[1].plot([x[0] for x in aic_scores], [x[1] for x in aic_scores], marker='o', label="AIC", color='orange')
ax[1].set_title("AIC Scores")
ax[1].set_xlabel("Number of Clusters")
ax[1].set_ylabel("AIC")
ax[2].plot([x[0] for x in silhouette_scores], [x[1] for x in silhouette_scores], marker='o', label="Silhouette Score", color='green')
ax[2].set_title("Silhouette Scores")
ax[2].set_xlabel("Number of Clusters")
ax[2].set_ylabel("Silhouette Score")
plt.show()
LDA
This code demonstrates how to perform topic modeling on a text dataset using Latent Dirichlet Allocation (LDA) with scikit-learn. First, it loads a subset of the 20 Newsgroups dataset, focusing on three categories, to serve as an example for topic discovery. The documents are then converted into a term-frequency matrix using CountVectorizer, with settings to exclude overly common and rare words, as well as English stop words. An LDA model with three topics is fit to this term-frequency data, which represents each document as a mixture of topics. After training, the code extracts and displays the top 10 words for each topic, providing insight into the main themes of each discovered topic. Finally, the code computes the topic distribution for each document, displaying the topic mix for the first five documents, thus illustrating how each document’s content is represented as a blend of different topics. This approach highlights the LDA model’s ability to capture the latent structure and thematic diversity within a collection of text documents.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd
# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', categories=['sci.space', 'rec.sport.baseball', 'talk.politics.mideast'])
documents = newsgroups.data
# Convert text data into a term-frequency matrix
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(documents)
# Fit the LDA model
n_topics = 3 # Number of topics we want to discover
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X)
# Get the feature names (words)
words = vectorizer.get_feature_names_out()
# Display the top words for each topic
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}:")
        # argsort is ascending, so the last num_top_words indices are the highest-weight words
        print(" ".join([feature_names[i] for i in topic.argsort()[-num_top_words:]]))
        print("\n")
# Show top 10 words for each topic
display_topics(lda, words, 10)
# Transform the documents to get topic distribution for each document
topic_distribution = lda.transform(X)
# Display the first 5 documents' topic distributions
print("Topic distribution for the first 5 documents:")
print(pd.DataFrame(np.round(topic_distribution[:5], 2), columns=[f"Topic {i+1}" for i in range(n_topics)]))
print("++++++++++++++++++++++++++++++++++++++++++")
# Access the topic-word distribution
topic_word_distribution = lda.components_
# Find the most likely topic for each word (argmax gives 0-based indices; index 0 corresponds to "Topic 1" above)
most_likely_topic_for_word = topic_word_distribution.argmax(axis=0)
# Create a DataFrame to display each word and its most likely topic
word_topic_df = pd.DataFrame({
'Word': words,
'Most_Likely_Topic': most_likely_topic_for_word
})
# Display the first 20 words in the vocabulary (alphabetical order) with their most likely topic
print(word_topic_df.head(20))
Topic 1:
just israeli jews people com article organization writes israel edu

Topic 2:
host don posting game year article writes com organization edu

Topic 3:
time turks armenia nasa said people armenians turkish armenian space

Topic distribution for the first 5 documents:
   Topic 1  Topic 2  Topic 3
0     0.00     1.00     0.00
1     1.00     0.00     0.00
2     0.36     0.05     0.58
3     0.00     0.69     0.31
4     0.00     0.99     0.00
++++++++++++++++++++++++++++++++++++++++++
          Word  Most_Likely_Topic
0           00                  2
1          000                  2
2         0000                  2
3        00000                  2
4       000021                  0
5       000050                  0
6     00041032                  2
7       000413                  0
8   0004244402                  2
9      0004422                  2
10        0005                  1
11       000th                  1
12         001                  1
13      001555                  0
14      001718                  0
15      002118                  0
16      002214                  2
17     002251w                  1
18        0029                  2
19      003015                  1
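As a quick follow-up, the sketch below shows how the fitted model can assign a topic mixture to an unseen piece of text, assuming the vectorizer and lda objects from the cells above are still in memory; the example sentence is made up for illustration.
# Hypothetical new document (not from the corpus), used only to illustrate inference on unseen text
new_doc = ["NASA plans another shuttle launch to study the orbit of a new satellite"]
# Reuse the fitted CountVectorizer and LDA model from above
new_X = vectorizer.transform(new_doc)
new_topic_distribution = lda.transform(new_X)
for i, weight in enumerate(new_topic_distribution[0]):
    print(f"Topic {i + 1}: {weight:.2f}")
Words outside the training vocabulary are simply ignored by the vectorizer, so the resulting mixture reflects only the terms the model has already seen.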