Python Automation #2: πŸ—³οΈ Data Transformation w/ polars, pyjanitor, pandas

Anix Lynch

1. Convert Column Names to Snake Case (pyjanitor.clean_names)

import pandas as pd
import janitor

# Sample DataFrame
df = pd.DataFrame({"Column Name 1": [1, 2], "AnotherColumn": [3, 4]})

# Convert column names to snake_case
df = df.clean_names(case_type="snake")
print(df)

Output:

   column_name_1  another_column
0              1               3
1              2               4

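If pyjanitor isn't installed, a plain-pandas rename covers the simple cases (a minimal sketch; note it won't split CamelCase words the way clean_names with case_type="snake" does):

import pandas as pd

df = pd.DataFrame({"Column Name 1": [1, 2], "AnotherColumn": [3, 4]})

# Lowercase, strip, and replace spaces with underscores
df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]
print(df.columns.tolist())  # ['column_name_1', 'anothercolumn']
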
2. Filter Rows by Condition (pandas.DataFrame.query)

import pandas as pd

# Sample DataFrame
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

# Filter rows where A > 1
filtered_df = df.query("A > 1")
print(filtered_df)

Output:

   A  B
1  2  5
2  3  6

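The same filter in polars is an expression passed to filter (a quick sketch on equivalent sample data):

import polars as pl

df_pl = pl.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

# Keep only rows where A > 1
print(df_pl.filter(pl.col("A") > 1))
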
3. Pivot or Unpivot Data (pandas.melt)

import pandas as pd

# Sample DataFrame
df = pd.DataFrame({"ID": [1, 2], "Jan": [100, 200], "Feb": [150, 250]})

# Unpivot (melt) the DataFrame
melted_df = pd.melt(df, id_vars=["ID"], var_name="Month", value_name="Sales")
print(melted_df)

Output:

   ID Month  Sales
0   1   Jan    100
1   2   Jan    200
2   1   Feb    150
3   2   Feb    250

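Going the other way (long back to wide) is a pivot; this sketch continues from melted_df above:

# Pivot back to wide format: one column per month
wide_df = melted_df.pivot(index="ID", columns="Month", values="Sales").reset_index()
print(wide_df)
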
4. Group By and Aggregate (polars.DataFrame.group_by)

import polars as pl

# Sample DataFrame
df = pl.DataFrame({"Category": ["A", "B", "A"], "Value": [10, 20, 30]})

# Group by and calculate sum
grouped_df = df.group_by("Category").agg(pl.col("Value").sum().alias("Total_Value"))
print(grouped_df)

Output:

shape: (2, 2)
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
β”‚ Category β”‚ Total_Value β”‚
β”‚ ---      β”‚ ---         β”‚
β”‚ str      β”‚ i64         β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ A        β”‚ 40          β”‚
β”‚ B        β”‚ 20          β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜

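A single group_by can return several aggregations at once; this sketch reuses the DataFrame above and adds a mean and a row count:

# Multiple aggregations per group
summary = df.group_by("Category").agg([
    pl.col("Value").sum().alias("Total_Value"),
    pl.col("Value").mean().alias("Avg_Value"),
    pl.col("Value").count().alias("Rows"),
])
print(summary)
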
5. Add Computed Columns (pyjanitor.add_column)

import pandas as pd
import janitor

# Sample DataFrame
df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

# Add computed column
df = df.add_column("C", df["A"] + df["B"])
print(df)

Output:

   A  B  C
0  1  3  4
1  2  4  6

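If you prefer to stay in plain pandas, assign does the same thing and accepts a callable directly (sketch on the same DataFrame):

# pandas-native equivalent of the computed column
df = df.assign(C=lambda x: x["A"] + x["B"])
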
6. Handle Missing Data (pyjanitor.fill_empty)

import pandas as pd
import janitor

# Sample DataFrame
df = pd.DataFrame({"A": [1, None, 3], "B": [None, 2, 3]})

# Fill missing values with 0
df = df.fill_empty(column_names=["A", "B"], value=0)
print(df)

Output:

     A    B
0  1.0  0.0
1  0.0  2.0
2  3.0  3.0

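Plain pandas covers the same case with fillna, which also accepts per-column fill values (a small self-contained sketch):

import pandas as pd

df = pd.DataFrame({"A": [1, None, 3], "B": [None, 2, 3]})

# Fill each column with its own default value
print(df.fillna({"A": 0, "B": df["B"].mean()}))
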
7. Join or Merge Datasets (polars.DataFrame.join)

import polars as pl

# Sample DataFrames
df1 = pl.DataFrame({"ID": [1, 2], "Value1": [10, 20]})
df2 = pl.DataFrame({"ID": [1, 2], "Value2": [30, 40]})

# Join DataFrames
joined_df = df1.join(df2, on="ID", how="inner")
print(joined_df)

Output:

shape: (2, 3)
β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”
β”‚ ID  β”‚ Value1 β”‚ Value2 β”‚
β”‚ --- β”‚ ---    β”‚ ---    β”‚
β”‚ i64 β”‚ i64    β”‚ i64    β”‚
β”œβ”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ 1   β”‚ 10     β”‚ 30     β”‚
β”‚ 2   β”‚ 20     β”‚ 40     β”‚
β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”˜

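Other join strategies use the same call; a left join keeps unmatched rows from the left frame and fills the gaps with nulls (sketch with hypothetical data that includes an unmatched ID):

import polars as pl

left = pl.DataFrame({"ID": [1, 2, 3], "Value1": [10, 20, 30]})
right = pl.DataFrame({"ID": [1, 2], "Value2": [30, 40]})

# ID 3 has no match in `right`, so its Value2 is null
print(left.join(right, on="ID", how="left"))
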
8. Reshape Data (Wide ↔ Long) (pandas.DataFrame.stack)

import pandas as pd

# Sample DataFrame
df = pd.DataFrame({"ID": [1, 2], "Jan": [100, 200], "Feb": [150, 250]})

# Reshape to long format
reshaped_df = df.set_index("ID").stack().reset_index(name="Sales")
print(reshaped_df)

Output:

   ID level_1  Sales
0   1     Jan    100
1   1     Feb    150
2   2     Jan    200
3   2     Feb    250

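unstack is the inverse of stack, so the long result can be pushed back to wide form (sketch continuing from df above):

# Reverse the reshape: unstack the stacked Series back into wide columns
wide_again = df.set_index("ID").stack().unstack()
print(wide_again)
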
9. Chaining Transformations (polars.lazy)

import polars as pl

# Sample DataFrame
df = pl.DataFrame({"Category": ["A", "B", "A"], "Value": [10, 20, 30]})

# Chain transformations: Group by, aggregate, and sort
result = (
    df.lazy()
    .group_by("Category")
    .agg(pl.col("Value").sum().alias("Total_Value"))
    .sort("Total_Value", descending=True)
    .collect()
)
print(result)

Output:

shape: (2, 2)
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
β”‚ Category β”‚ Total_Value β”‚
β”‚ ---      β”‚ ---         β”‚
β”‚ str      β”‚ i64         β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ A        β”‚ 40          β”‚
β”‚ B        β”‚ 20          β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜

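Before collecting, a lazy query can be inspected; explain() prints the optimized plan without running anything (sketch continuing from df above):

lazy_query = (
    df.lazy()
    .group_by("Category")
    .agg(pl.col("Value").sum().alias("Total_Value"))
    .sort("Total_Value", descending=True)
)

# Show the optimized query plan, then execute it
print(lazy_query.explain())
print(lazy_query.collect())
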
10. Handle Large Datasets (polars.scan_csv)

import polars as pl

# Simulating reading a large dataset
df = pl.scan_csv("large_dataset.csv")

# Perform operations lazily
result = df.group_by("Category").agg(pl.col("Value").mean().alias("Value_mean")).collect()
print(result)

Output:

shape: (N, 2)
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
β”‚ Category  β”‚ Value_mean β”‚
β”‚ ---       β”‚ ---        β”‚
β”‚ str       β”‚ f64        β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ A         β”‚ 25.5       β”‚
β”‚ B         β”‚ 30.0       β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜

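Because the scan is lazy, filters written after scan_csv are pushed down so only matching rows are processed (sketch; the column names follow the hypothetical large_dataset.csv above):

result = (
    pl.scan_csv("large_dataset.csv")
    .filter(pl.col("Value") > 0)  # predicate pushdown: applied while scanning
    .group_by("Category")
    .agg(pl.col("Value").mean().alias("Value_mean"))
    .collect()
)
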
11. Add or Remove Rows/Columns (pandas.DataFrame.drop, pandas.concat)

import pandas as pd

# Sample DataFrame
df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

# Add a new column
df["C"] = [5, 6]

# Remove a column
df = df.drop(columns=["A"])

print(df)

Output:

   B  C
0  3  5
1  4  6

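DataFrame.append was removed in pandas 2.0, so adding rows is now a concat of a small frame, and rows are dropped by label (sketch continuing from df above):

# Append a row by concatenating a one-row DataFrame
new_row = pd.DataFrame({"B": [5], "C": [7]})
df = pd.concat([df, new_row], ignore_index=True)

# Drop a row by its index label
df = df.drop(index=0)
print(df)
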
12. Sort Rows by Column (polars.DataFrame.sort)

import polars as pl

# Sample DataFrame
df = pl.DataFrame({"Name": ["Alice", "Bob"], "Score": [85, 90]})

# Sort rows by "Score"
sorted_df = df.sort("Score", descending=False)
print(sorted_df)

Output:

shape: (2, 2)
β”Œβ”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”
β”‚ Name  β”‚ Score β”‚
β”‚ ---   β”‚ ---   β”‚
β”‚ str   β”‚ i64   β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€
β”‚ Alice β”‚ 85    β”‚
β”‚ Bob   β”‚ 90    β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”˜

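Descending and multi-column sorts use the descending parameter (sketch on the same DataFrame):

# Highest score first
print(df.sort("Score", descending=True))

# Sort by several columns, with a direction per column
print(df.sort(["Name", "Score"], descending=[False, True]))
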
13. Apply Functions Row-Wise (pandas.DataFrame.apply)

import pandas as pd

# Sample DataFrame
df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

# Apply a function row-wise
df["Sum"] = df.apply(lambda row: row["A"] + row["B"], axis=1)
print(df)

Output:

   A  B  Sum
0  1  3    4
1  2  4    6

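Row-wise apply is flexible but slow on large frames; for simple arithmetic, the vectorized form gives the same result much faster:

# Vectorized equivalent of the row-wise apply
df["Sum"] = df["A"] + df["B"]
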
14. Transform Specific Data Types (pyjanitor.transform_column)

import pandas as pd
import janitor

# Sample DataFrame
df = pd.DataFrame({"A": ["1", "2"], "B": ["3.5", "4.5"]})

# Transform data types
df = df.transform_column("A", int).transform_column("B", float)
print(df)

Output:

   A    B
0  1  3.5
1  2  4.5

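Plain pandas handles per-column dtype conversion in a single call with astype (sketch on the same sample):

# Convert several columns with one dtype mapping
df = df.astype({"A": int, "B": float})
print(df.dtypes)
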
15. Concatenate Datasets (pandas.concat, polars.concat)

Using Pandas:

import pandas as pd

# Sample DataFrames
df1 = pd.DataFrame({"A": [1, 2]})
df2 = pd.DataFrame({"A": [3, 4]})

# Concatenate datasets
df = pd.concat([df1, df2], ignore_index=True)
print(df)

Output:

   A
0  1
1  2
2  3
3  4

Using Polars:

import polars as pl

# Sample DataFrames
df1 = pl.DataFrame({"A": [1, 2]})
df2 = pl.DataFrame({"A": [3, 4]})

# Concatenate datasets
df = pl.concat([df1, df2])
print(df)

Output:

shape: (4, 1)
β”Œβ”€β”€β”€β”€β”€β”
β”‚ A   β”‚
β”‚ i64 β”‚
β”œβ”€β”€β”€β”€β”€β”€
β”‚ 1   β”‚
β”‚ 2   β”‚
β”‚ 3   β”‚
β”‚ 4   β”‚
β””β”€β”€β”€β”€β”€β”˜

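Concatenation can also go side by side; with pandas, axis=1 lines frames up by index (a small self-contained sketch):

import pandas as pd

left = pd.DataFrame({"A": [1, 2]})
right = pd.DataFrame({"B": [3, 4]})

# Column-wise concatenation
print(pd.concat([left, right], axis=1))
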
16. Efficient Column Selection (polars.select)

import polars as pl

# Sample DataFrame
df = pl.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})

# Select specific columns
selected_df = df.select(["A", "C"])
print(selected_df)

Output:

shape: (2, 2)
β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”
β”‚ A   β”‚ C   β”‚
β”‚ i64 β”‚ i64 β”‚
β”œβ”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€
β”‚ 1   β”‚ 5   β”‚
β”‚ 2   β”‚ 6   β”‚
β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”˜

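Selection also works with expressions, which is handy for excluding columns (sketch on the same DataFrame):

# Everything except column B
print(df.select(pl.exclude("B")))
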
17. Split and Expand Strings (pandas.Series.str.split)

import pandas as pd

# Sample DataFrame
df = pd.DataFrame({"Name": ["Alice Smith", "Bob Jones"]})

# Split and expand strings
df_split = df["Name"].str.split(" ", expand=True)
print(df_split)

Output:

       0      1
0  Alice  Smith
1    Bob  Jones

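The expanded parts can be assigned straight back as named columns (continuing from df above):

# Store the split pieces under descriptive names
df[["First", "Last"]] = df["Name"].str.split(" ", expand=True)
print(df)
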
18. Lazy Evaluation for Transformations (polars.scan_csv)

import polars as pl

# Simulate loading a large dataset lazily
df = pl.scan_csv("large_dataset.csv")

# Perform operations
result = df.filter(pl.col("Category") == "A").select(["Value"]).collect()
print(result)

Output:

shape: (N, 1)
β”Œβ”€β”€β”€β”€β”€β”€β”€β”
β”‚ Value β”‚
β”‚ i64   β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€
β”‚ ...   β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”˜

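If the filtered result is itself large, it can be streamed straight to disk instead of collected into memory (sketch; the output filename is a placeholder and sink_parquet requires a reasonably recent polars):

# Write the lazily-filtered rows to Parquet without materializing them
(
    pl.scan_csv("large_dataset.csv")
    .filter(pl.col("Category") == "A")
    .select(["Value"])
    .sink_parquet("category_a_values.parquet")
)
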
19. One-Liner Transformations (pyjanitor.clean_names)

import pandas as pd
import janitor

# Sample DataFrame
df = pd.DataFrame({"A Col ": [1, 2], "Another-Col": [3, 4]})

# One-liner transformation
df = df.clean_names(strip_underscores=True).assign(sum=lambda x: x["a_col"] + x["another_col"])
print(df)

Output:

   a_col  another_col  sum
0      1            3    4
1      2            4    6
