NLTK (Natural Language Toolkit)
NLTK is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning.
Installation
# Basic installation
pip install nltk
# Install with datasets
pip install nltk[all]
# Download specific data
python -c "import nltk; nltk.download('punkt')"
python -c "import nltk; nltk.download('stopwords')"
python -c "import nltk; nltk.download('vader_lexicon')"
python -c "import nltk; nltk.download('wordnet')"
python -c "import nltk; nltk.download('omw-1.4')"
# Download all datasets (large)
python -c "import nltk; nltk.download('all')"
Basic Setup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.sentiment import SentimentIntensityAnalyzer
# Download required data (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('vader_lexicon')
Core Functionality
Text Tokenization
# Sentence tokenization
text = "Hello world. This is NLTK. It's great for NLP!"
sentences = sent_tokenize(text)
print(sentences) # ['Hello world.', 'This is NLTK.', "It's great for NLP!"]
# Word tokenization
words = word_tokenize(text)
print(words) # ['Hello', 'world', '.', 'This', 'is', 'NLTK', '.', ...]
# Custom tokenizers
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer, LineTokenizer
# Only alphabetic tokens
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)
# Whitespace tokenization
ws_tokenizer = WhitespaceTokenizer()
tokens = ws_tokenizer.tokenize(text)
Stop Words Removal
from nltk.corpus import stopwords
# Get English stopwords
stop_words = set(stopwords.words('english'))
# Filter stop words
words = word_tokenize("This is a sample sentence with stop words.")
filtered_words = [w for w in words if w.lower() not in stop_words]
print(filtered_words) # ['sample', 'sentence', 'stop', 'words', '.']
# Custom stop words
custom_stops = stop_words.union({'sample', 'example'})
Stemming and Lemmatization
# Porter Stemmer
stemmer = PorterStemmer()
words = ["running", "runs", "ran", "runner"]
stems = [stemmer.stem(word) for word in words]
print(stems) # ['run', 'run', 'ran', 'runner']
# WordNet Lemmatizer (more accurate)
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(word, pos='v') for word in words]
print(lemmas) # ['run', 'run', 'run', 'runner']
# Lemmatize with different POS tags
word = "better"
print(lemmatizer.lemmatize(word, pos='a')) # good (adjective)
print(lemmatizer.lemmatize(word, pos='r')) # well (adverb)
Part-of-Speech Tagging
# POS tagging
text = "The quick brown fox jumps over the lazy dog"
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)
print(pos_tags)
# [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN'), ...]
# Extract specific POS
nouns = [word for word, pos in pos_tags if pos.startswith('N')]
adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
# Universal POS tags
from nltk.tag import pos_tag
from nltk.corpus import brown
universal_tags = pos_tag(tokens, tagset='universal')
Named Entity Recognition
# Named entity chunking
tokens = word_tokenize("Barack Obama was the 44th President of the United States.")
pos_tags = pos_tag(tokens)
entities = ne_chunk(pos_tags)
# Extract named entities
from nltk import Tree
def extract_entities(tree):
entities = []
if hasattr(tree, 'label'):
entities.append((tree.label(), [token for token, pos in tree.leaves()]))
else:
for child in tree:
entities.extend(extract_entities(child))
return entities
named_entities = extract_entities(entities)
print(named_entities) # [('PERSON', ['Barack', 'Obama']), ...]
Common Use Cases
Sentiment Analysis
# VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()
texts = [
"I love this product! It's amazing!",
"This is terrible. I hate it.",
"It's okay, nothing special.",
"Best purchase ever! Highly recommend!"
]
for text in texts:
scores = sia.polarity_scores(text)
print(f"Text: {text}")
print(f"Positive: {scores['pos']:.3f}")
print(f"Negative: {scores['neg']:.3f}")
print(f"Neutral: {scores['neu']:.3f}")
print(f"Compound: {scores['compound']:.3f}")
print("-" * 50)
# Simple sentiment classification
def classify_sentiment(text):
score = sia.polarity_scores(text)['compound']
if score >= 0.05:
return 'Positive'
elif score <= -0.05:
return 'Negative'
else:
return 'Neutral'
Text Preprocessing Pipeline
import re
import string
def preprocess_text(text,
lowercase=True,
remove_punctuation=True,
remove_stopwords=True,
lemmatize=True):
"""Complete text preprocessing pipeline"""
# Convert to lowercase
if lowercase:
text = text.lower()
# Remove URLs, emails, mentions
text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
text = re.sub(r'\@\w+|\#\w+', '', text)
# Remove punctuation
if remove_punctuation:
text = text.translate(str.maketrans('', '', string.punctuation))
# Tokenization
tokens = word_tokenize(text)
# Remove stopwords
if remove_stopwords:
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words]
# Lemmatization
if lemmatize:
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(token) for token in tokens]
return tokens
# Usage
text = "I'm loving this new product! Check out https://example.com #awesome @company"
processed = preprocess_text(text)
print(processed) # ['loving', 'new', 'product', 'check']
Frequency Analysis
from nltk import FreqDist
from collections import Counter
# Word frequency distribution
text = "the quick brown fox jumps over the lazy dog the fox is quick"
tokens = word_tokenize(text.lower())
fdist = FreqDist(tokens)
# Most common words
print(fdist.most_common(5)) # [('the', 3), ('quick', 2), ('fox', 2), ...]
# Plot frequency distribution
fdist.plot(30, cumulative=False)
# Conditional frequency distribution
from nltk import ConditionalFreqDist
from nltk.corpus import brown
# Frequency by genre
cfdist = ConditionalFreqDist(
(genre, word)
for genre in brown.categories()
for word in brown.words(categories=genre)
)
# Words most common in news vs romance
cfdist['news'].most_common(10)
cfdist['romance'].most_common(10)
N-grams and Collocations
from nltk import ngrams, collocations
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
# Generate n-grams
text = "the quick brown fox jumps over the lazy dog"
tokens = word_tokenize(text)
# Bigrams
bigrams = list(ngrams(tokens, 2))
print(bigrams[:5]) # [('the', 'quick'), ('quick', 'brown'), ...]
# Trigrams
trigrams = list(ngrams(tokens, 3))
print(trigrams[:3]) # [('the', 'quick', 'brown'), ...]
# Find collocations
from nltk.corpus import text1 # Moby Dick
# Bigram collocations
bigram_measures = collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(text1.tokens)
finder.apply_freq_filter(3) # Only bigrams that appear 3+ times
# Best collocations by PMI
collocations = finder.nbest(bigram_measures.pmi, 10)
print(collocations) # [('Sperm', 'Whale'), ('Moby', 'Dick'), ...]
Advanced Features
Custom Text Classification
import random
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
# Prepare movie review dataset
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
# Feature extraction
def document_features(document):
"""Extract features from document"""
words = set(document)
features = {}
# Word presence features
for word in word_features:
features[f'contains({word})'] = (word in words)
return features
# Get most informative words
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]
# Create feature sets
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
# Train classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Evaluate
accuracy = nltk.classify.accuracy(classifier, test_set)
print(f"Accuracy: {accuracy:.3f}")
# Show most informative features
classifier.show_most_informative_features(5)
Working with Corpora
from nltk.corpus import gutenberg, reuters, wordnet
# Gutenberg corpus
print(gutenberg.fileids()) # List of books
emma = gutenberg.words('austen-emma.txt')
print(f"Emma has {len(emma)} words")
# Reuters corpus
print(reuters.categories()) # News categories
finance_docs = reuters.fileids('money-fx')
print(f"Finance articles: {len(finance_docs)}")
# WordNet (semantic dictionary)
from nltk.corpus import wordnet as wn
# Synsets (synonym sets)
dog_synsets = wn.synsets('dog')
print(dog_synsets[0].definition()) # 'a member of the genus Canis...'
# Hypernyms and hyponyms
dog = wn.synset('dog.n.01')
print(dog.hypernyms()) # More general terms
print(dog.hyponyms()) # More specific terms
# Semantic similarity
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')
similarity = dog.path_similarity(cat)
print(f"Dog-cat similarity: {similarity:.3f}")
Text Parsing and Chunking
# Grammar-based chunking
grammar = r"""
NP: {<DT|JJ|NN.*>+} # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>} # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs followed by NP or PP
"""
chunk_parser = nltk.RegexpParser(grammar)
sentence = "The little yellow dog barked at the cat"
tokens = word_tokenize(sentence)
pos_tags = pos_tag(tokens)
parsed = chunk_parser.parse(pos_tags)
# Draw parse tree
parsed.draw()
# Extract noun phrases
def extract_noun_phrases(tree):
noun_phrases = []
for subtree in tree:
if hasattr(subtree, 'label') and subtree.label() == 'NP':
np = ' '.join([token for token, pos in subtree.leaves()])
noun_phrases.append(np)
return noun_phrases
nps = extract_noun_phrases(parsed)
print(nps) # ['The little yellow dog', 'the cat']
Integration with Other Libraries
With Pandas for Data Analysis
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
# Create sample dataset
data = {
'review': [
"This product is amazing! Love it!",
"Terrible quality. Very disappointed.",
"It's okay, nothing special.",
"Best purchase I've ever made!"
],
'rating': [5, 1, 3, 5]
}
df = pd.DataFrame(data)
# Add sentiment analysis
sia = SentimentIntensityAnalyzer()
df['sentiment_compound'] = df['review'].apply(
lambda x: sia.polarity_scores(x)['compound']
)
# Add preprocessing
def preprocess_for_analysis(text):
tokens = word_tokenize(text.lower())
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
return ' '.join(tokens)
df['processed_text'] = df['review'].apply(preprocess_for_analysis)
df['word_count'] = df['processed_text'].apply(lambda x: len(x.split()))
print(df[['review', 'sentiment_compound', 'word_count']])
With Scikit-learn for ML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
# Custom tokenizer using NLTK
def nltk_tokenizer(text):
tokens = word_tokenize(text.lower())
return [lemmatizer.lemmatize(token) for token in tokens if token.isalpha()]
# Create pipeline
pipeline = Pipeline([
('tfidf', TfidfVectorizer(tokenizer=nltk_tokenizer, stop_words='english')),
('classifier', MultinomialNB())
])
# Train model (using movie reviews data)
X = [' '.join(d) for d, c in documents]
y = [c for d, c in documents]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
print(classification_report(y_test, predictions))
Best Practices
Performance Tips
# 1. Cache expensive operations
import functools
@functools.lru_cache(maxsize=1000)
def cached_lemmatize(word, pos='n'):
return lemmatizer.lemmatize(word, pos=pos)
# 2. Use generators for large datasets
def process_large_corpus(file_path):
with open(file_path, 'r') as f:
for line in f:
yield preprocess_text(line.strip())
# 3. Batch processing
def batch_sentiment_analysis(texts, batch_size=100):
sia = SentimentIntensityAnalyzer()
results = []
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
batch_results = [sia.polarity_scores(text) for text in batch]
results.extend(batch_results)
return results
# 4. Efficient stopword removal
stop_words = set(stopwords.words('english')) # Create once, reuse many times
def remove_stopwords_efficiently(tokens):
return [token for token in tokens if token.lower() not in stop_words]
Memory Management
# For large text processing
import gc
from collections import deque
def process_large_text_stream(text_stream, window_size=1000):
"""Process large text streams efficiently"""
buffer = deque(maxlen=window_size)
for text in text_stream:
# Process text
processed = preprocess_text(text)
buffer.append(processed)
# Periodic cleanup
if len(buffer) == window_size:
# Do something with buffer
yield list(buffer)
gc.collect() # Force garbage collection
Error Handling
def robust_text_processing(text):
"""Text processing with error handling"""
try:
# Validate input
if not isinstance(text, str):
text = str(text)
if not text.strip():
return []
# Process with fallbacks
try:
tokens = word_tokenize(text)
except:
# Fallback to simple split
tokens = text.split()
# Safe POS tagging
try:
pos_tags = pos_tag(tokens)
except:
pos_tags = [(token, 'NN') for token in tokens]
return pos_tags
except Exception as e:
print(f"Error processing text: {e}")
return []
Real-world Examples
Complete Sentiment Analysis Pipeline
class SentimentAnalyzer:
def __init__(self):
self.sia = SentimentIntensityAnalyzer()
self.lemmatizer = WordNetLemmatizer()
self.stop_words = set(stopwords.words('english'))
def preprocess(self, text):
"""Clean and preprocess text"""
# Basic cleaning
text = re.sub(r'http\S+|www\S+', '', text)
text = re.sub(r'[^a-zA-Z\s]', '', text)
# Tokenization and normalization
tokens = word_tokenize(text.lower())
tokens = [self.lemmatizer.lemmatize(token)
for token in tokens
if token not in self.stop_words and len(token) > 2]
return ' '.join(tokens)
def analyze(self, text):
"""Perform sentiment analysis"""
# Preprocess
clean_text = self.preprocess(text)
# Get sentiment scores
scores = self.sia.polarity_scores(text) # Use original text for better accuracy
# Classify sentiment
compound = scores['compound']
if compound >= 0.05:
sentiment = 'positive'
elif compound <= -0.05:
sentiment = 'negative'
else:
sentiment = 'neutral'
return {
'sentiment': sentiment,
'confidence': abs(compound),
'scores': scores,
'processed_text': clean_text
}
# Usage
analyzer = SentimentAnalyzer()
result = analyzer.analyze("I absolutely love this new product! It's fantastic!")
print(result)
Text Summarization with NLTK
from nltk.tokenize import sent_tokenize
from collections import Counter
import math
def extractive_summarization(text, num_sentences=3):
"""Simple extractive summarization using TF-IDF"""
# Tokenize into sentences
sentences = sent_tokenize(text)
if len(sentences) <= num_sentences:
return text
# Tokenize and preprocess
all_words = []
sentence_words = []
for sentence in sentences:
words = word_tokenize(sentence.lower())
words = [word for word in words if word.isalpha() and word not in stop_words]
sentence_words.append(words)
all_words.extend(words)
# Calculate word frequencies
word_freq = Counter(all_words)
# Calculate sentence scores
sentence_scores = []
for words in sentence_words:
score = sum(word_freq[word] for word in words)
sentence_scores.append(score)
# Get top sentences
top_indices = sorted(range(len(sentence_scores)),
key=lambda i: sentence_scores[i],
reverse=True)[:num_sentences]
# Return sentences in original order
top_indices.sort()
summary_sentences = [sentences[i] for i in top_indices]
return ' '.join(summary_sentences)
# Usage
long_text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence
concerned with the interactions between computers and human language. In particular, it focuses on programming
computers to process and analyze large amounts of natural language data. The result is a computer capable of
"understanding" the contents of documents, including the contextual nuances of the language within them.
The technology can then accurately extract information and insights contained in the documents as well as
categorize and organize the documents themselves. Challenges in natural language processing frequently involve
speech recognition, natural language understanding, and natural language generation.
"""
summary = extractive_summarization(long_text, num_sentences=2)
print(summary)
This cheat sheet covers the essential aspects of NLTK for natural language processing tasks. The library is particularly strong in academic and research contexts, providing comprehensive tools for text analysis, linguistic processing, and building NLP applications. Its extensive corpus collection and built-in algorithms make it an excellent choice for learning NLP concepts and rapid prototyping.