Sentence-Transformers (UKPLab)

Sentence-Transformers is a Python framework for state-of-the-art sentence, text, and image embeddings. It provides an easy way to compute dense vector representations for sentences, paragraphs, and images, enabling semantic similarity computation, clustering, and semantic search.

Installation

# Basic installation
pip install sentence-transformers

# With optional dependencies
pip install sentence-transformers[train]

# Development version
pip install git+https://github.com/UKPLab/sentence-transformers.git

# With specific backends
pip install sentence-transformers torch torchvision
pip install sentence-transformers tensorflow

Basic Setup

from sentence_transformers import SentenceTransformer, util, InputExample, losses
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist

Core Functionality

Loading Models

# Popular pre-trained models
model = SentenceTransformer('all-MiniLM-L6-v2')  # Good balance of quality vs speed
model = SentenceTransformer('all-mpnet-base-v2')  # High quality
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Paraphrase detection
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')  # Question answering

# Multilingual models
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

# Specialized models
model = SentenceTransformer('msmarco-distilbert-base-v4')  # Web search
model = SentenceTransformer('nli-distilroberta-base-v2')  # Natural language inference

# Load from local path
model = SentenceTransformer('/path/to/model')

# Load with specific device
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')

Basic Encoding

# Single sentence encoding
sentence = "This is a sample sentence."
embedding = model.encode(sentence)
print(f"Embedding shape: {embedding.shape}")  # (384,) for MiniLM models

# Multiple sentences
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium."
]

embeddings = model.encode(sentences)
print(f"Embeddings shape: {embeddings.shape}")  # (3, 384)

# Batch processing with progress bar
embeddings = model.encode(sentences, show_progress_bar=True)

# Convert to tensor
embeddings = model.encode(sentences, convert_to_tensor=True)
print(type(embeddings))  # torch.Tensor

# Normalize embeddings (for cosine similarity)
embeddings = model.encode(sentences, normalize_embeddings=True)

Similarity Computation

# Compute similarity matrix
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium."
]

embeddings = model.encode(sentences)

# Cosine similarity
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])

# Pairwise similarity
similarities = util.pytorch_cos_sim(embeddings, embeddings)

# Find most similar pairs
pairs = []
for i in range(len(similarities)):
    for j in range(i+1, len(similarities)):
        pairs.append((i, j, similarities[i][j].item()))

# Sort by similarity
pairs.sort(key=lambda x: x[2], reverse=True)
print(f"Most similar pair: sentences {pairs[0][0]} and {pairs[0][1]} with score {pairs[0][2]:.4f}")

Common Use Cases

Semantic Search

import numpy as np

# Create a corpus of documents
corpus = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey."
]

# Encode corpus
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

# Query
queries = [
    "A man is eating pasta.",
    "Someone in a gorilla costume is playing a set of drums.",
    "A cheetah chases prey on across a field."
]

# Find the closest 5 sentences to each query
top_k = min(5, len(corpus))

for query in queries:
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine-similarities between query and corpus
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

    # Sort results
    top_results = torch.topk(cos_scores, k=top_k)

    print(f"\nQuery: {query}")
    print(f"Top {top_k} most similar sentences in corpus:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(f"(Score: {score:.4f}) {corpus[idx]}")

Advanced Semantic Search with Faiss

import faiss
import numpy as np

def create_faiss_index(embeddings):
    """Create FAISS index for fast similarity search"""
    dimension = embeddings.shape[1]

    # Create index
    index = faiss.IndexFlatIP(dimension)  # Inner product (cosine with normalized vectors)

    # Normalize embeddings for cosine similarity
    embeddings_np = embeddings.cpu().numpy() if hasattr(embeddings, 'cpu') else embeddings
    faiss.normalize_L2(embeddings_np)

    # Add embeddings to index
    index.add(embeddings_np.astype('float32'))

    return index

def semantic_search_faiss(query, model, index, corpus, top_k=5):
    """Perform semantic search using FAISS"""
    # Encode query
    query_embedding = model.encode([query])

    # Normalize
    faiss.normalize_L2(query_embedding)

    # Search
    scores, indices = index.search(query_embedding.astype('float32'), top_k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        results.append({
            'score': float(score),
            'text': corpus[idx],
            'index': int(idx)
        })

    return results

# Usage
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
index = create_faiss_index(corpus_embeddings)

query = "A person is eating food"
results = semantic_search_faiss(query, model, index, corpus)

print(f"Query: {query}")
for result in results:
    print(f"Score: {result['score']:.4f} - {result['text']}")

Clustering

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Sample sentences for clustering
sentences = [
    # Sports
    "The football game was exciting.",
    "Basketball is my favorite sport.",
    "Tennis requires good coordination.",

    # Food
    "This pizza tastes amazing.",
    "I love cooking Italian food.",
    "The restaurant serves great pasta.",

    # Technology
    "Machine learning is fascinating.",
    "AI will change the world.",
    "Programming languages are evolving.",

    # Weather
    "It's a beautiful sunny day.",
    "The weather is getting cold.",
    "Rain is expected tomorrow."
]

# Get embeddings
embeddings = model.encode(sentences)

# Perform clustering
num_clusters = 4
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_assignments = kmeans.fit_predict(embeddings)

# Group sentences by cluster
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignments):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []
    clustered_sentences[cluster_id].append(sentences[sentence_id])

# Print results
for cluster_id, cluster_sentences in clustered_sentences.items():
    print(f"Cluster {cluster_id + 1}:")
    for sentence in cluster_sentences:
        print(f"  - {sentence}")
    print()

Paraphrase Detection

def find_paraphrases(sentences, threshold=0.7):
    """Find paraphrases in a list of sentences"""
    embeddings = model.encode(sentences)
    similarity_matrix = util.cos_sim(embeddings, embeddings)

    paraphrases = []
    for i in range(len(sentences)):
        for j in range(i+1, len(sentences)):
            similarity = similarity_matrix[i][j].item()
            if similarity > threshold:
                paraphrases.append({
                    'sentence1': sentences[i],
                    'sentence2': sentences[j],
                    'similarity': similarity
                })

    return sorted(paraphrases, key=lambda x: x['similarity'], reverse=True)

# Example sentences with some paraphrases
test_sentences = [
    "The cat is sleeping on the couch.",
    "A feline is resting on the sofa.",
    "The dog is barking loudly.",
    "The weather is nice today.",
    "It's a beautiful day outside.",
    "The car is red.",
    "The canine is making loud sounds."
]

paraphrases = find_paraphrases(test_sentences, threshold=0.6)

print("Potential paraphrases found:")
for para in paraphrases:
    print(f"Similarity: {para['similarity']:.3f}")
    print(f"1: {para['sentence1']}")
    print(f"2: {para['sentence2']}")
    print("-" * 50)

Question Answering with Retrieval

def qa_retrieval_system(questions, contexts, model, top_k=3):
    """Simple QA retrieval system using sentence embeddings"""

    # Encode all contexts
    context_embeddings = model.encode(contexts, convert_to_tensor=True)

    results = []
    for question in questions:
        # Encode question
        question_embedding = model.encode(question, convert_to_tensor=True)

        # Find most similar contexts
        similarities = util.cos_sim(question_embedding, context_embeddings)[0]
        top_indices = torch.topk(similarities, k=min(top_k, len(contexts)))[1]

        # Get top contexts
        top_contexts = [contexts[idx] for idx in top_indices]
        top_scores = [similarities[idx].item() for idx in top_indices]

        results.append({
            'question': question,
            'top_contexts': list(zip(top_contexts, top_scores))
        })

    return results

# Example usage
contexts = [
    "Paris is the capital of France and its largest city.",
    "London is the capital of England and the United Kingdom.",
    "Berlin is the capital of Germany.",
    "Madrid is the capital of Spain.",
    "Rome is the capital of Italy and home to Vatican City.",
    "Tokyo is the capital of Japan and the world's most populous metropolitan area."
]

questions = [
    "What is the capital of France?",
    "Which city is the capital of Germany?",
    "Tell me about the capital of Italy."
]

qa_results = qa_retrieval_system(questions, contexts, model)

for result in qa_results:
    print(f"Question: {result['question']}")
    print("Most relevant contexts:")
    for context, score in result['top_contexts']:
        print(f"  Score: {score:.3f} - {context}")
    print()

Advanced Features

Custom Model Training

from sentence_transformers import InputExample, losses
from torch.utils.data import DataLoader

# Prepare training data
train_examples = [
    InputExample(texts=['My first sentence', 'My second sentence'], label=0.8),
    InputExample(texts=['Another pair', 'Completely different'], label=0.3)
]

# Create DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Define loss function
train_loss = losses.CosineSimilarityLoss(model)

# Training
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
    output_path='./my-sentence-transformer'
)

# For triplet training (anchor, positive, negative)
train_examples_triplet = [
    InputExample(texts=['Anchor sentence', 'Positive sentence', 'Negative sentence'])
]

train_dataloader_triplet = DataLoader(train_examples_triplet, shuffle=True, batch_size=16)
train_loss_triplet = losses.TripletLoss(model)

model.fit(
    train_objectives=[(train_dataloader_triplet, train_loss_triplet)],
    epochs=1,
    output_path='./my-triplet-model'
)

Cross-Encoders for Reranking

from sentence_transformers.cross_encoder import CrossEncoder

# Load cross-encoder model
cross_encoder = CrossEncoder('ms-marco-MiniLM-L-6-v2')

def rerank_results(query, candidates, cross_encoder, top_k=5):
    """Rerank search results using a cross-encoder"""

    # Create query-candidate pairs
    pairs = [[query, candidate] for candidate in candidates]

    # Get cross-encoder scores
    scores = cross_encoder.predict(pairs)

    # Sort by score
    ranked_results = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)

    return ranked_results[:top_k]

# Example: Two-stage retrieval and reranking
def two_stage_search(query, corpus, bi_encoder, cross_encoder, 
                     retrieve_top_k=20, rerank_top_k=5):
    """Two-stage search: bi-encoder retrieval + cross-encoder reranking"""

    # Stage 1: Fast retrieval with bi-encoder
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    corpus_embeddings = bi_encoder.encode(corpus, convert_to_tensor=True)

    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=min(retrieve_top_k, len(corpus)))

    # Get candidate texts
    candidates = [corpus[idx] for idx in top_results[1]]

    # Stage 2: Reranking with cross-encoder
    reranked = rerank_results(query, candidates, cross_encoder, rerank_top_k)

    return reranked

# Usage
query = "Information about machine learning"
corpus = [
    "Machine learning is a subset of artificial intelligence.",
    "Deep learning uses neural networks with many layers.",
    "Natural language processing helps computers understand text.",
    "Computer vision enables machines to interpret visual information.",
    "Reinforcement learning trains agents through trial and error."
]

bi_encoder = SentenceTransformer('all-MiniLM-L6-v2')
cross_encoder = CrossEncoder('ms-marco-MiniLM-L-6-v2')

results = two_stage_search(query, corpus, bi_encoder, cross_encoder)

print(f"Query: {query}")
print("Reranked results:")
for i, (text, score) in enumerate(results, 1):
    print(f"{i}. Score: {score:.4f} - {text}")

Multi-lingual Embeddings

# Load multilingual model
multilingual_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Sentences in different languages
sentences = [
    "Hello, how are you?",           # English
    "Hola, ¿cómo estás?",           # Spanish
    "Bonjour, comment allez-vous?",  # French
    "Hallo, wie geht es dir?",      # German
    "Ciao, come stai?",             # Italian
    "こんにちは、元気ですか？",           # Japanese
    "你好，你好吗？"                   # Chinese
]

# Get embeddings
embeddings = multilingual_model.encode(sentences)

# Compute similarity matrix
similarity_matrix = util.cos_sim(embeddings, embeddings)

print("Cross-lingual similarity matrix:")
for i, sent1 in enumerate(sentences):
    for j, sent2 in enumerate(sentences):
        if i != j:
            sim = similarity_matrix[i][j].item()
            if sim > 0.5:  # Only show high similarities
                print(f"Similarity: {sim:.3f}")
                print(f"  '{sent1}' <-> '{sent2}'")

Working with Images (CLIP)

from sentence_transformers import SentenceTransformer
from PIL import Image
import requests
from io import BytesIO

# Load CLIP model
clip_model = SentenceTransformer('clip-ViT-B-32')

# Load images
image_urls = [
    "https://example.com/cat.jpg",
    "https://example.com/dog.jpg"
]

images = []
for url in image_urls:
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))
    images.append(img)

# Text descriptions
texts = [
    "A photo of a cat",
    "A photo of a dog",
    "A picture of a bird",
    "An image of a car"
]

# Get embeddings
image_embeddings = clip_model.encode(images)
text_embeddings = clip_model.encode(texts)

# Compute similarity between images and texts
similarities = util.cos_sim(image_embeddings, text_embeddings)

print("Image-Text Similarities:")
for i, img_url in enumerate(image_urls):
    print(f"\nImage {i+1}: {img_url}")
    for j, text in enumerate(texts):
        sim = similarities[i][j].item()
        print(f"  '{text}': {sim:.3f}")

Integration with Other Libraries

With Pandas for Data Analysis

import pandas as pd

# Create sample dataset
data = {
    'text': [
        "I love this product! It's amazing!",
        "Great quality and fast shipping.",
        "Terrible experience, would not recommend.",
        "Average product, nothing special.",
        "Outstanding customer service!"
    ],
    'rating': [5, 4, 1, 3, 5]
}

df = pd.DataFrame(data)

# Add embeddings
embeddings = model.encode(df['text'].tolist())
df['embedding'] = embeddings.tolist()

# Find similar reviews
def find_similar_reviews(query_text, df, model, top_k=3):
    query_embedding = model.encode([query_text])

    # Compute similarities
    similarities = []
    for i, row in df.iterrows():
        sim = cosine_similarity([query_embedding[0]], [row['embedding']])[0][0]
        similarities.append(sim)

    df['similarity'] = similarities
    return df.nlargest(top_k, 'similarity')[['text', 'rating', 'similarity']]

# Usage
query = "excellent customer support"
similar_reviews = find_similar_reviews(query, df, model)
print(similar_reviews)

With Streamlit for Web Apps

import streamlit as st
import plotly.express as px
from sklearn.manifold import TSNE

st.title("Sentence Similarity Explorer")

# Text input
user_texts = st.text_area("Enter sentences (one per line):", 
                         value="The weather is nice today.\nIt's a beautiful day.\nThe car is red.")

if user_texts:
    sentences = [s.strip() for s in user_texts.split('\n') if s.strip()]

    # Compute embeddings
    embeddings = model.encode(sentences)

    # Compute similarity matrix
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()

    # Display similarity matrix
    st.subheader("Similarity Matrix")
    fig = px.imshow(similarity_matrix, 
                    labels=dict(x="Sentences", y="Sentences"),
                    text_auto=True)
    st.plotly_chart(fig)

    # 2D visualization using t-SNE
    if len(sentences) > 2:
        st.subheader("2D Embedding Visualization")
        tsne = TSNE(n_components=2, random_state=42)
        embeddings_2d = tsne.fit_transform(embeddings)

        fig_scatter = px.scatter(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1],
                               hover_data=[sentences],
                               title="Sentence Embeddings (t-SNE)")
        st.plotly_chart(fig_scatter)

Best Practices

Performance Optimization

# 1. Use appropriate model sizes
models_by_performance = {
    'fastest': 'all-MiniLM-L6-v2',
    'balanced': 'all-mpnet-base-v2', 
    'highest_quality': 'sentence-transformers/gtr-t5-large'
}

# 2. Batch processing for large datasets
def encode_large_dataset(texts, model, batch_size=32):
    """Efficiently encode large datasets"""
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        batch_embeddings = model.encode(batch, show_progress_bar=False)
        embeddings.extend(batch_embeddings)

    return np.array(embeddings)

# 3. Use appropriate precision
embeddings = model.encode(sentences, precision='float32')  # vs 'float64'

# 4. Normalize embeddings if using cosine similarity
embeddings = model.encode(sentences, normalize_embeddings=True)

# 5. Use GPU when available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

Memory Management

import gc
from typing import Iterator, List

def process_large_corpus(corpus: List[str], 
                        model: SentenceTransformer,
                        batch_size: int = 1000) -> Iterator[np.ndarray]:
    """Process large corpus in chunks to manage memory"""

    for i in range(0, len(corpus), batch_size):
        batch = corpus[i:i + batch_size]

        # Encode batch
        embeddings = model.encode(batch, convert_to_numpy=True)

        yield embeddings

        # Clean up
        gc.collect()

# Usage for very large datasets
def save_embeddings_chunked(corpus, model, output_path, batch_size=1000):
    """Save embeddings for large corpus in chunks"""
    all_embeddings = []

    for chunk_embeddings in process_large_corpus(corpus, model, batch_size):
        all_embeddings.append(chunk_embeddings)

    # Concatenate all embeddings
    final_embeddings = np.vstack(all_embeddings)
    np.save(output_path, final_embeddings)

    return final_embeddings

Model Selection Guidelines

def recommend_model(use_case, performance_priority='balanced'):
    """Recommend model based on use case and performance requirements"""

    recommendations = {
        'semantic_search': {
            'fast': 'all-MiniLM-L6-v2',
            'balanced': 'all-mpnet-base-v2',
            'quality': 'sentence-transformers/gtr-t5-large'
        },
        'question_answering': {
            'fast': 'multi-qa-MiniLM-L6-cos-v1',
            'balanced': 'multi-qa-mpnet-base-cos-v1',
            'quality': 'sentence-transformers/gtr-t5-xl'
        },
        'paraphrase_detection': {
            'fast': 'paraphrase-MiniLM-L6-v2',
            'balanced': 'paraphrase-mpnet-base-v2',
            'quality': 'sentence-transformers/gtr-t5-large'
        },
        'multilingual': {
            'fast': 'paraphrase-multilingual-MiniLM-L12-v2',
            'balanced': 'sentence-transformers/LaBSE',
            'quality': 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
        }
    }

    return recommendations.get(use_case, {}).get(performance_priority, 'all-MiniLM-L6-v2')

# Usage
model_name = recommend_model('semantic_search', 'fast')
print(f"Recommended model: {model_name}")

Real-world Examples

Complete Semantic Search Engine

import json
import pickle
from pathlib import Path
from typing import List, Dict, Any

class SemanticSearchEngine:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.documents = []
        self.embeddings = None
        self.index = None

    def add_documents(self, documents: List[Dict[str, Any]]):
        """Add documents to the search engine"""
        self.documents.extend(documents)

        # Extract text for embedding
        texts = [doc.get('text', str(doc)) for doc in documents]

        # Compute embeddings
        new_embeddings = self.model.encode(texts, convert_to_tensor=True)

        if self.embeddings is None:
            self.embeddings = new_embeddings
        else:
            self.embeddings = torch.cat([self.embeddings, new_embeddings], dim=0)

    def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Search for similar documents"""
        if self.embeddings is None:
            return []

        # Encode query
        query_embedding = self.model.encode(query, convert_to_tensor=True)

        # Compute similarities
        cos_scores = util.cos_sim(query_embedding, self.embeddings)[0]

        # Get top results
        top_results = torch.topk(cos_scores, k=min(top_k, len(self.documents)))

        results = []
        for score, idx in zip(top_results[0], top_results[1]):
            doc = self.documents[idx.item()].copy()
            doc['similarity_score'] = score.item()
            results.append(doc)

        return results

    def save(self, path: str):
        """Save the search engine state"""
        state = {
            'documents': self.documents,
            'embeddings': self.embeddings.cpu().numpy() if self.embeddings is not None else None,
            'model_name': self.model._modules['0'].auto_model.config._name_or_path
        }

        with open(path, 'wb') as f:
            pickle.dump(state, f)

    def load(self, path: str):
        """Load the search engine state"""
        with open(path, 'rb') as f:
            state = pickle.load(f)

        self.documents = state['documents']
        if state['embeddings'] is not None:
            self.embeddings = torch.tensor(state['embeddings'])

# Usage example
search_engine = SemanticSearchEngine()

# Add documents
documents = [
    {"title": "Machine Learning Basics", "text": "Introduction to machine learning algorithms and concepts", "category": "AI"},
    {"title": "Python Programming", "text": "Learn Python programming from scratch", "category": "Programming"},
    {"title": "Data Science Guide", "text": "Comprehensive guide to data science and analytics", "category": "Data"},
    {"title": "Neural Networks", "text": "Deep learning and neural network architectures", "category": "AI"},
]

search_engine.add_documents(documents)

# Search
results = search_engine.search("artificial intelligence and machine learning", top_k=3)

print("Search Results:")
for i, result in enumerate(results, 1):
    print(f"{i}. {result['title']} (Score: {result['similarity_score']:.3f})")
    print(f"   Category: {result['category']}")
    print(f"   Text: {result['text']}")
    print()

# Save for later use
search_engine.save("my_search_engine.pkl")

Document Clustering and Topic Analysis

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

class DocumentAnalyzer:
    def __init__(self, model_name='all-mpnet-base-v2'):
        self.model = SentenceTransformer(model_name)

    def analyze_documents(self, documents, num_clusters=5):
        """Analyze documents: embeddings, clustering, and visualization"""

        # Get embeddings
        embeddings = self.model.encode(documents, show_progress_bar=True)

        # Perform clustering
        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(embeddings)

        # Dimensionality reduction for visualization
        pca = PCA(n_components=2, random_state=42)
        embeddings_2d = pca.fit_transform(embeddings)

        # Create results
        results = {
            'documents': documents,
            'embeddings': embeddings,
            'cluster_labels': cluster_labels,
            'embeddings_2d': embeddings_2d,
            'cluster_centers': kmeans.cluster_centers_
        }

        return results

    def visualize_clusters(self, results):
        """Visualize document clusters"""
        plt.figure(figsize=(12, 8))
        scatter = plt.scatter(results['embeddings_2d'][:, 0], 
                            results['embeddings_2d'][:, 1], 
                            c=results['cluster_labels'], 
                            cmap='tab10', 
                            alpha=0.7)

        plt.colorbar(scatter)
        plt.title('Document Clusters (2D PCA)')
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')

        # Add some document texts as annotations
        for i in range(0, len(results['documents']), max(1, len(results['documents'])//10)):
            plt.annotate(results['documents'][i][:30] + '...', 
                        (results['embeddings_2d'][i, 0], results['embeddings_2d'][i, 1]),
                        fontsize=8, alpha=0.7)

        plt.tight_layout()
        plt.show()

    def get_cluster_summaries(self, results, max_examples=3):
        """Get representative examples for each cluster"""
        summaries = {}

        for cluster_id in range(len(set(results['cluster_labels']))):
            # Get documents in this cluster
            cluster_docs = [doc for i, doc in enumerate(results['documents']) 
                          if results['cluster_labels'][i] == cluster_id]

            # Get embeddings for this cluster
            cluster_embeddings = results['embeddings'][results['cluster_labels'] == cluster_id]
            cluster_center = results['cluster_centers'][cluster_id]

            # Find most representative documents (closest to center)
            distances = np.linalg.norm(cluster_embeddings - cluster_center, axis=1)
            closest_indices = np.argsort(distances)[:max_examples]

            representative_docs = [cluster_docs[idx] for idx in closest_indices]

            summaries[f'Cluster {cluster_id}'] = {
                'size': len(cluster_docs),
                'representative_docs': representative_docs
            }

        return summaries

# Example usage
analyzer = DocumentAnalyzer()

# Sample documents (in practice, load from your data source)
documents = [
    "Machine learning algorithms for data analysis",
    "Python programming tutorial for beginners",
    "Natural language processing with transformers",
    "Web development using React and Node.js",
    "Deep learning neural networks architecture",
    "Database design and SQL optimization",
    "Computer vision and image recognition",
    "JavaScript frameworks comparison",
    "Data science workflow and best practices",
    "Mobile app development with Flutter"
]

# Analyze documents
results = analyzer.analyze_documents(documents, num_clusters=3)

# Visualize
analyzer.visualize_clusters(results)

# Get cluster summaries
summaries = analyzer.get_cluster_summaries(results)
print("Cluster Analysis:")
for cluster_name, info in summaries.items():
    print(f"\n{cluster_name} ({info['size']} documents):")
    for doc in info['representative_docs']:
        print(f"  - {doc}")

This comprehensive cheat sheet covers the essential aspects of Sentence-Transformers. The library excels at creating meaningful vector representations of text, enabling powerful semantic search, similarity computation, and clustering applications. Its ease of use and extensive model collection make it ideal for both research and production use cases.