Polars
Polars is a blazingly fast DataFrame library implemented in Rust, offering lazy and eager execution, multi-threading, and a powerful expression API for Python.
Quick Start
Installation
pip install polars
# With optional dependencies
pip install polars[all]
# For cloud storage support
pip install polars[aws,azure,gcp]
Basic Import
import polars as pl
DataFrame Creation
From Dictionary
df = pl.DataFrame({
"name": ["Alice", "Bob", "Charlie"],
"age": [25, 30, 35],
"city": ["NY", "LA", "Chicago"]
})
From Lists
df = pl.DataFrame([
["Alice", 25, "NY"],
["Bob", 30, "LA"],
["Charlie", 35, "Chicago"]
], schema=["name", "age", "city"])
From NumPy/Pandas
import numpy as np
import pandas as pd
# From NumPy
df = pl.from_numpy(np_array, schema=["col1", "col2"])
# From Pandas
df = pl.from_pandas(pd_df)
Data Loading & Saving
CSV Operations
# Read CSV
df = pl.read_csv("data.csv")
# Scan CSV (lazy loading)
lazy_df = pl.scan_csv("data.csv")
# Write CSV
df.write_csv("output.csv")
# Advanced reading with options
df = pl.read_csv(
"data.csv",
separator=",",
has_header=True,
dtypes={"age": pl.Int64, "salary": pl.Float64},
null_values=["NULL", "NA", ""],
try_parse_dates=True
)
Parquet Operations
# Read Parquet
df = pl.read_parquet("data.parquet")
# Scan Parquet (lazy loading)
lazy_df = pl.scan_parquet("data.parquet")
# Write Parquet
df.write_parquet("output.parquet")
# Read with filters (predicate pushdown)
df = pl.scan_parquet("data.parquet").filter(
pl.col("age") > 25
).collect()
JSON Operations
# Read JSON
df = pl.read_json("data.json")
# Read NDJSON (newline-delimited JSON)
df = pl.read_ndjson("data.ndjson")
# Write JSON
df.write_json("output.json")
df.write_ndjson("output.ndjson")
Cloud Storage
# Read from S3
df = pl.read_parquet("s3://bucket/file.parquet")
# Read from Azure
df = pl.read_csv("azure://container/file.csv")
# Read from Google Cloud
df = pl.read_parquet("gs://bucket/file.parquet")
Data Selection & Filtering
Column Selection
# Select columns
df.select(["name", "age"])
df.select(pl.col("name", "age"))
# Select with expressions
df.select([
pl.col("name"),
pl.col("age").alias("years"),
pl.col("salary") * 1.1 # 10% increase
])
# Select by data type
df.select(pl.col(pl.Utf8)) # String columns
df.select(pl.col(pl.Int64)) # Integer columns
Row Filtering
# Basic filtering
df.filter(pl.col("age") > 25)
# Multiple conditions
df.filter(
(pl.col("age") > 25) &
(pl.col("city") == "NY")
)
# String operations
df.filter(pl.col("name").str.contains("A"))
df.filter(pl.col("name").str.starts_with("A"))
df.filter(pl.col("name").str.ends_with("e"))
# Null filtering
df.filter(pl.col("age").is_not_null())
df.filter(pl.col("name").is_null())
First/Last Rows
df.head(5) # First 5 rows
df.tail(5) # Last 5 rows
df.limit(10) # First 10 rows
df.slice(5, 10) # Rows 5-14
Data Transformation
Adding/Modifying Columns
# Add new columns
df = df.with_columns([
pl.col("age").alias("years"),
(pl.col("salary") * 1.1).alias("new_salary"),
pl.lit("employee").alias("type")
])
# Conditional column creation
df = df.with_columns(
pl.when(pl.col("age") > 30)
.then(pl.lit("Senior"))
.otherwise(pl.lit("Junior"))
.alias("level")
)
String Operations
df = df.with_columns([
pl.col("name").str.to_uppercase().alias("name_upper"),
pl.col("name").str.to_lowercase().alias("name_lower"),
pl.col("name").str.len().alias("name_length"),
pl.col("text").str.replace("old", "new"),
pl.col("text").str.split(" ").alias("words")
])
Mathematical Operations
df = df.with_columns([
pl.col("value") + 10,
pl.col("value") * 2,
pl.col("value").sqrt(),
pl.col("value").log(),
pl.col("value").abs(),
pl.col("value").round(2)
])
Date/Time Operations
df = df.with_columns([
pl.col("date").dt.year().alias("year"),
pl.col("date").dt.month().alias("month"),
pl.col("date").dt.day().alias("day"),
pl.col("date").dt.weekday().alias("weekday"),
pl.col("timestamp").dt.hour().alias("hour")
])
Aggregation & Grouping
Basic Aggregations
# Single column aggregations
df.select([
pl.col("age").sum(),
pl.col("age").mean(),
pl.col("age").median(),
pl.col("age").min(),
pl.col("age").max(),
pl.col("age").std(),
pl.col("age").var()
])
Group By Operations
# Basic groupby
result = df.group_by("city").agg([
pl.col("age").mean().alias("avg_age"),
pl.col("salary").sum().alias("total_salary"),
pl.col("name").count().alias("count")
])
# Multiple groupby columns
result = df.group_by(["city", "department"]).agg([
pl.col("salary").mean(),
pl.col("age").max()
])
# Complex aggregations
result = df.group_by("city").agg([
pl.col("age").filter(pl.col("age") > 25).mean(),
pl.col("salary").quantile(0.95),
pl.col("name").n_unique()
])
Window Functions
df = df.with_columns([
pl.col("salary").sum().over("department").alias("dept_total"),
pl.col("salary").rank().over("department").alias("salary_rank"),
pl.col("age").mean().over("city").alias("city_avg_age"),
pl.col("sales").rolling_mean(window_size=3).alias("rolling_avg")
])
Joining Operations
Basic Joins
# Inner join
result = df1.join(df2, on="id", how="inner")
# Left join
result = df1.join(df2, on="id", how="left")
# Outer join
result = df1.join(df2, on="id", how="outer")
# Different column names
result = df1.join(df2, left_on="user_id", right_on="id")
Advanced Joins
# Multiple columns
result = df1.join(df2, on=["id", "date"])
# As-of join (temporal join)
result = df1.join_asof(
df2,
on="timestamp",
strategy="backward" # or "forward", "nearest"
)
# Cross join
result = df1.join(df2, how="cross")
# Semi and anti joins
result = df1.join(df2, on="id", how="semi") # Keep matching rows
result = df1.join(df2, on="id", how="anti") # Keep non-matching rows
Data Manipulation
Sorting
# Single column
df.sort("age")
df.sort("age", descending=True)
# Multiple columns
df.sort(["city", "age"], descending=[False, True])
# By expression
df.sort(pl.col("salary") * pl.col("bonus"))
Unique Values
# Get unique rows
df.unique()
# Unique based on subset
df.unique(subset=["city"])
# Drop duplicates
df = df.unique(maintain_order=True)
Reshaping
# Pivot
pivot_df = df.pivot(
values="salary",
index="name",
columns="year",
aggregate="sum"
)
# Melt (unpivot)
melted = df.melt(
id_vars=["name", "city"],
value_vars=["salary_2022", "salary_2023"],
variable_name="year",
value_name="salary"
)
Handling Missing Data
# Fill null values
df = df.fill_null(0) # Fill with constant
df = df.fill_null(pl.col("age").median()) # Fill with median
# Drop null values
df = df.drop_nulls() # Drop rows with any nulls
df = df.drop_nulls(subset=["age", "salary"]) # Drop if specific columns null
# Forward/backward fill
df = df.fill_null(strategy="forward")
df = df.fill_null(strategy="backward")
Lazy Evaluation
Creating Lazy Frames
# From scan operations
lazy_df = pl.scan_csv("large_file.csv")
# Convert DataFrame to LazyFrame
lazy_df = df.lazy()
Lazy Operations
# Chain operations without execution
query = (
pl.scan_csv("data.csv")
.filter(pl.col("age") > 25)
.group_by("city")
.agg(pl.col("salary").mean())
.sort("salary", descending=True)
)
# Execute the query
result = query.collect()
# Show query plan
print(query.explain())
Streaming
# For very large datasets
result = (
pl.scan_csv("huge_file.csv")
.filter(pl.col("value") > 100)
.group_by("category")
.agg(pl.col("amount").sum())
.collect(streaming=True)
)
Column Operations
List Operations
df = df.with_columns([
pl.col("list_col").list.len().alias("list_length"),
pl.col("list_col").list.get(0).alias("first_item"),
pl.col("list_col").list.sum().alias("list_sum"),
pl.col("list_col").list.unique().alias("unique_items")
])
# Explode lists to rows
df = df.explode("list_col")
Struct Operations
# Create struct
df = df.with_columns(
pl.struct(["name", "age"]).alias("person")
)
# Extract struct fields
df = df.with_columns([
pl.col("person").struct.field("name"),
pl.col("person").struct.field("age")
])
Categorical Operations
# Convert to categorical
df = df.with_columns(
pl.col("category").cast(pl.Categorical)
)
# Get categories
df.select(pl.col("category").cat.get_categories())
Performance Optimization
Data Types
# Optimize data types for memory
df = df.with_columns([
pl.col("small_int").cast(pl.Int8), # vs Int64
pl.col("category").cast(pl.Categorical), # vs String
pl.col("flag").cast(pl.Boolean) # vs String
])
Query Optimization
# Use lazy evaluation for large datasets
lazy_result = (
pl.scan_parquet("large_data.parquet")
.select(["needed_col1", "needed_col2"]) # Project early
.filter(pl.col("date") > "2023-01-01") # Filter early
.collect()
)
# Use predicate pushdown with file formats
df = pl.scan_parquet("data.parquet").filter(
pl.col("partition_col") == "value"
).collect()
Memory Management
# For memory-constrained environments
df = df.rechunk() # Optimize memory layout
# Process in chunks
for batch in pl.read_csv_batched("large_file.csv", batch_size=10000):
result = process_batch(batch)
Integration with Other Libraries
Pandas Integration
# Convert to pandas
pandas_df = df.to_pandas()
# From pandas with zero-copy (when possible)
polars_df = pl.from_pandas(pandas_df, rechunk=False)
NumPy Integration
# To NumPy
numpy_array = df.to_numpy()
# From NumPy
df = pl.from_numpy(numpy_array, schema=["col1", "col2"])
Arrow Integration
# To Arrow
arrow_table = df.to_arrow()
# From Arrow
df = pl.from_arrow(arrow_table)
SQL Interface
Basic SQL
# Execute SQL on DataFrame
result = df.sql("""
SELECT name, age, salary * 1.1 as new_salary
FROM self
WHERE age > 25
ORDER BY salary DESC
""")
# Global SQL context
result = pl.sql("""
SELECT df1.name, df2.department
FROM df1
JOIN df2 ON df1.id = df2.user_id
""", eager=True)
Table Functions in SQL
# Read files directly in SQL
result = pl.sql("""
SELECT * FROM read_csv('data.csv')
WHERE age > 25
""")
Common Patterns
Data Cleaning Pipeline
cleaned_df = (
pl.scan_csv("raw_data.csv")
.filter(pl.col("id").is_not_null())
.with_columns([
pl.col("name").str.strip_chars().str.to_title(),
pl.col("age").cast(pl.Int32),
pl.col("email").str.to_lowercase()
])
.filter(pl.col("age").is_between(18, 100))
.unique(subset=["id"])
.collect()
)
Time Series Analysis
time_series_df = (
df.sort("timestamp")
.with_columns([
pl.col("value").rolling_mean(window_size=7).alias("7day_avg"),
pl.col("value").rolling_std(window_size=7).alias("7day_std"),
pl.col("value").pct_change().alias("pct_change")
])
.filter(pl.col("timestamp") > pl.date(2023, 1, 1))
)
Business Metrics
metrics = (
sales_df
.group_by(["region", "product"])
.agg([
pl.col("revenue").sum().alias("total_revenue"),
pl.col("quantity").sum().alias("total_quantity"),
pl.col("customer_id").n_unique().alias("unique_customers"),
(pl.col("revenue") / pl.col("quantity")).mean().alias("avg_price")
])
.with_columns(
(pl.col("total_revenue") / pl.col("total_revenue").sum()).alias("revenue_share")
)
.sort("total_revenue", descending=True)
)
Error Handling & Debugging
Common Errors
# Schema mismatch in joins
try:
result = df1.join(df2, on="id")
except pl.SchemaError as e:
print(f"Schema error: {e}")
# Type casting errors
df = df.with_columns(
pl.col("maybe_numeric").cast(pl.Float64, strict=False)
)
Debugging
# Inspect intermediate results
df.glimpse() # Overview of DataFrame
df.describe() # Statistical summary
# Check lazy query plan
lazy_df.explain()
lazy_df.show_graph() # Visual representation
Best Practices
- Use lazy evaluation for large datasets and complex queries
- Project and filter early to reduce memory usage
- Specify data types explicitly when reading data
- Use categorical types for string columns with limited unique values
- Leverage predicate pushdown with file formats like Parquet
- Chain operations in a single expression when possible
- Use window functions instead of self-joins for analytical queries
- Prefer scan_* functions over read_* for large files
- Use streaming for datasets larger than memory
- Consider partitioning for very large datasets
Common Gotchas
- Column selection:
pl.col("name")vs"name"- use expressions for consistency - Lazy evaluation: Remember to
.collect()lazy frames - Boolean operators: Use
&and|instead ofandandor - String operations: Chain
.strmethods properly - Join suffixes: Specify suffixes to avoid column name conflicts
- Memory usage: Large eager DataFrames can consume significant memory
- Type inference: Be explicit with data types for predictable behavior
This cheat sheet covers the most commonly used Polars operations. For advanced features and detailed documentation, visit the official Polars documentation.