import seaborn as sns
import matplotlib.pyplot as plt
def generate_swarm_violin(data, x, y, hue):
    """Draw a swarm plot with a violin plot on the same axes and show it.

    Args:
        data: Dataset handed to seaborn as ``data`` (e.g. a pandas DataFrame).
        x: Name of the column plotted on the x axis.
        y: Name of the column plotted on the y axis.
        hue: Name of the column used to colour the swarm points.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Set figure size
    plt.figure(figsize=(10, 6))

    # Semi-transparent swarm points, coloured by ``hue``
    ax = sns.swarmplot(
        x=x, y=y, data=data, palette="Set1", hue=hue, alpha=0.3, size=10
    )

    # Add the violin outlines to the same axes
    sns.violinplot(x=x, y=y, data=data, inner=None, color="white", ax=ax)

    # Hide the legend. ``ax.get_legend()`` returns None when no legend was
    # drawn, so guard the removal instead of dereferencing ``ax.legend_``.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    # Add title and labels
    plt.title(f"{y} by {x} ({hue})")
    plt.xlabel(x)
    plt.ylabel(y)

    # Show the plot
    plt.show()
# Load the example data (``tips`` is not defined earlier in this file) and
# draw the swarm-violin plot.
tips = sns.load_dataset("tips")
generate_swarm_violin(data=tips, x="day", y="total_bill", hue="time")
# Swarm-violin plot
# Strip-violin plot
import seaborn as sns
import matplotlib.pyplot as plt
def generate_strip_violin(data, x, y, hue):
    """Draw a strip plot with a violin plot on the same axes and show it.

    Args:
        data: Dataset handed to seaborn as ``data`` (e.g. a pandas DataFrame).
        x: Name of the column plotted on the x axis.
        y: Name of the column plotted on the y axis.
        hue: Name of the column used to colour the strip points.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Set figure size
    plt.figure(figsize=(7, 5))

    # Semi-transparent strip points, coloured by ``hue``
    ax = sns.stripplot(
        x=x, y=y, data=data, palette="Set1", hue=hue, alpha=0.3, size=10
    )

    # Add the violin outlines to the same axes
    sns.violinplot(x=x, y=y, data=data, inner=None, color="white", ax=ax)

    # Hide the legend. ``ax.get_legend()`` returns None when no legend was
    # drawn, so guard the removal instead of dereferencing ``ax.legend_``.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    # Add title and labels
    plt.title(f"{y} by {x} ({hue})")
    plt.xlabel(x)
    plt.ylabel(y)

    # Show the plot
    plt.show()
# Load the data
tips = sns.load_dataset("tips")

# Draw the strip-violin plot of total bill per day, coloured by meal time
generate_strip_violin(data=tips, x="day", y="total_bill", hue="time")
# Strip-violin plot with percentage of category
import seaborn as sns
import matplotlib.pyplot as plt
def generate_strip_violin(data, x, y, hue):
    """Draw a strip + violin plot and label each x category with its row share.

    For every category on the x axis, the percentage of all rows of ``data``
    that fall in that category is written at the category's position, at the
    height of the category's mean ``y`` value.

    Args:
        data: pandas DataFrame containing the ``x``, ``y`` and ``hue`` columns.
        x: Name of the categorical column plotted on the x axis.
        y: Name of the numeric column plotted on the y axis.
        hue: Name of the column used to colour the strip points.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Set figure size
    plt.figure(figsize=(10, 6))

    # Semi-transparent strip points, coloured by ``hue``
    ax = sns.stripplot(
        x=x, y=y, data=data, palette="Set1", hue=hue, alpha=0.3, size=10
    )

    # Add the violin outlines to the same axes
    sns.violinplot(x=x, y=y, data=data, inner=None, color="white", ax=ax)

    # Categories in the order they occupy on the x axis: the declared order
    # for a categorical column, otherwise order of first appearance.
    # NOTE(review): assumes ``x`` is a categorical or string column; a
    # numeric ``x`` may be ordered differently by seaborn - confirm.
    levels = getattr(data[x].dtype, "categories", None)
    if levels is None:
        levels = data[x].dropna().unique()

    # Label every category with its share of all rows. Positions come from
    # the data itself (category index, mean of y) so a label can never be
    # attached to another category's strip.
    total_rows = len(data)
    for position, level in enumerate(levels):
        values = data.loc[data[x] == level, y]
        if values.empty:
            # Declared category without rows: nothing to label.
            continue
        percentage = len(values) / total_rows * 100
        text = f"{percentage:.1f}%"
        ax.annotate(
            text,
            xy=(position, values.mean()),
            fontsize=10,
            ha='center',
            va='center',
        )

    # Hide the legend. seaborn may not draw one at all (e.g. when ``hue``
    # repeats ``x``), in which case ``ax.get_legend()`` returns None.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    # Add title and labels
    plt.title(f"{y} by {x} ({hue})")
    plt.xlabel(x)
    plt.ylabel(y)

    # Show the plot
    plt.show()
# Load the data
tips = sns.load_dataset("tips")

# Draw the strip-violin plot per day, with the day also used as the hue
generate_strip_violin(data=tips, x="day", y="total_bill", hue="day")
# Strip-violin plot
# - add percentage of category
# - add mean value
# - add line graph
import seaborn as sns
import matplotlib.pyplot as plt
def generate_strip_violin(data, x, y, hue):
    """Draw a strip + violin plot with per-category share, mean and a mean line.

    For every category on the x axis, the percentage of all rows of ``data``
    in that category and the category's mean ``y`` value are written at the
    category's position; the means are also connected by a line.

    Args:
        data: pandas DataFrame containing the ``x``, ``y`` and ``hue`` columns.
        x: Name of the categorical column plotted on the x axis.
        y: Name of the numeric column plotted on the y axis.
        hue: Name of the column used to colour the strip points.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Set figure size
    plt.figure(figsize=(10, 6))

    # Semi-transparent strip points, coloured by ``hue``
    ax = sns.stripplot(
        x=x, y=y, data=data, palette="Set1", hue=hue, alpha=0.3, size=10
    )

    # Add the violin outlines to the same axes
    sns.violinplot(x=x, y=y, data=data, inner=None, color="white", ax=ax)

    # Categories in the order they occupy on the x axis: the declared order
    # for a categorical column, otherwise order of first appearance.
    # NOTE(review): assumes ``x`` is a categorical or string column; a
    # numeric ``x`` may be ordered differently by seaborn - confirm.
    levels = getattr(data[x].dtype, "categories", None)
    if levels is None:
        levels = data[x].dropna().unique()

    # Collect one (position, mean) point per category for the line, and
    # annotate each category with its share of all rows and its mean.
    # Positions come from the data itself so a label can never be attached
    # to another category's strip.
    xline = []
    yline = []
    total_rows = len(data)
    for position, level in enumerate(levels):
        values = data.loc[data[x] == level, y]
        if values.empty:
            # Declared category without rows: nothing to label or connect.
            continue

        percentage = len(values) / total_rows * 100
        mean_value = values.mean()

        xline.append(position)
        yline.append(mean_value)

        text = f"(Perc: {percentage:.1f}%) \n(Mean: {mean_value:.2f})"
        ax.annotate(
            text,
            xy=(position, mean_value),
            fontsize=10,
            ha='center',
            va='center',
        )

    # Connect the category means with a line on the same axes
    sns.lineplot(
        x=xline,
        y=yline,
        markers=True,
        linewidth=3,
        dashes=True,
        marker='o',
        ax=ax,
    )

    # Hide the legend. seaborn may not draw one at all (e.g. when ``hue``
    # repeats ``x``), in which case ``ax.get_legend()`` returns None.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    # Add title and labels
    plt.title(f"{y} by {x} ({hue})")
    plt.xlabel(x)
    plt.ylabel(y)

    # Show the plot
    plt.show()
# Load the data
tips = sns.load_dataset("tips")

# Draw the strip-violin plot per day, with the day also used as the hue
generate_strip_violin(data=tips, x="day", y="total_bill", hue="day")
# Plot the relationship between two categorical variables
import pandas as pd
import numpy as np
# Generate random data (fixed seed so the sample is reproducible)
np.random.seed(0)

# Number of data points
n = 1000

# Create a DataFrame
data = pd.DataFrame({
    'ltv': np.random.uniform(0, 1, n),  # Random 'ltv' values between 0 and 1
    'rating': np.random.choice(
        ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'], n
    ),
})

# Calculate the 'ltv_decile' using quantiles. The labels name the quantile
# rank range of each of the ten equally populated bins.
decile_labels = [f'[{i/10:.1f}, {(i+1)/10:.1f})' for i in range(10)]
data['ltv_decile'] = pd.qcut(data['ltv'], 10, labels=decile_labels)

# Display the first few rows of the DataFrame
print(data.head())
# Output:
#         ltv rating  ltv_decile
# 0  0.548814      B  [0.5, 0.6)
# 1  0.715189      G  [0.7, 0.8)
# 2  0.602763      F  [0.6, 0.7)
# 3  0.544883      E  [0.5, 0.6)
# 4  0.423655      E  [0.4, 0.5)
import pandas as pd
import matplotlib.pyplot as plt
def plot_rating_ltv(data, category_var, numeric_var, num_quantiles, list_categories):
    """Plot, per category, the share of rows in each quantile bin.

    Rows of ``data`` are restricted to ``list_categories``, ``numeric_var`` is
    cut into ``num_quantiles`` equally populated bins, and a stacked bar chart
    shows for each category which percentage of its rows falls in each bin.
    Every non-empty segment is labelled with its percentage, and one line per
    bin connects that bin's label positions across the categories.

    Args:
        data: pandas DataFrame holding ``category_var`` and ``numeric_var``.
            It is not modified; the function works on a filtered copy.
        category_var: Name of the categorical column (one bar per value).
        numeric_var: Name of the numeric column that is binned.
        num_quantiles: Number of quantile bins passed to ``pd.qcut``.
        list_categories: Values of ``category_var`` to keep.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Filter first and copy afterwards, so that adding the bin column below
    # writes to an independent frame and not to a slice of the caller's data.
    data = data[data[category_var].isin(list_categories)].copy()

    # Generate the bin column name based on numeric_var and num_quantiles
    decile_col_name = f'{numeric_var}_decile_{num_quantiles}'

    # Bin labels name the quantile-rank range of each bin.
    # NOTE(review): with one decimal the labels can repeat for large
    # num_quantiles, which pd.qcut would likely reject - confirm intended range.
    decile_labels = [
        f'[{i/num_quantiles:.1f}, {(i+1)/num_quantiles:.1f})'
        for i in range(num_quantiles)
    ]
    data[decile_col_name] = pd.qcut(
        data[numeric_var], num_quantiles, labels=decile_labels
    )

    # Count rows per (category, bin). ``observed=False`` is spelled out so
    # that every bin keeps a column even when it has no rows.
    rating_ltv_counts = (
        data.groupby([category_var, decile_col_name], observed=False)
        .size()
        .unstack(fill_value=0)
    )
    rows_per_category = rating_ltv_counts.sum(axis=1)

    # Percentage of each bin within its category
    rating_ltv_percentages = rating_ltv_counts.div(rows_per_category, axis=0) * 100

    # Cumulative percentage across the bins within each category
    rating_ltv_cumulative = (
        rating_ltv_counts.cumsum(axis=1).div(rows_per_category, axis=0) * 100
    )

    # Label height of each segment: a two-wide rolling mean over the
    # cumulative percentages. The window runs across the bins, so the frame
    # is transposed for the rolling call (``rolling(axis=1)`` is deprecated)
    # and transposed back afterwards.
    rating_ltv_position = (
        rating_ltv_cumulative.T
        .rolling(window=2, min_periods=1, center=True)
        .mean()
        .T
        .fillna(0)
    )

    # Plot a stacked bar chart
    ax = rating_ltv_percentages.plot(kind='bar', stacked=True, figsize=(10, 6))

    # Add percentage labels to each segment. The test is on the segment's
    # percentage (not its position) so that every 0% segment is skipped;
    # ``> 0`` also skips NaN from categories without rows.
    for column_name, heights in rating_ltv_position.items():
        percentages = rating_ltv_percentages[column_name].values
        for bar_index, (height, percentage) in enumerate(zip(heights, percentages)):
            if percentage > 0:
                ax.text(
                    bar_index,
                    height,
                    f'{percentage:.1f}%',
                    ha='center',
                    va='top',
                    fontsize=8,
                )

    ax.set_ylabel("Percentage")
    ax.set_xlabel(category_var)
    ax.set_title(f"Percentage of {decile_col_name} by {category_var}")

    # Move the legend to the right. It is created before the lines are
    # drawn, so it lists the bar segments only.
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    # Add one line per bin through its label positions. Bars sit at
    # 0, 1, 2, ... on the x axis, so the lines use those positions rather
    # than the index labels.
    bar_positions = range(len(rating_ltv_position.index))
    for category in rating_ltv_position.columns:
        ax.plot(
            bar_positions,
            rating_ltv_position[category].values,
            label=f'{category_var} {category}',
            marker='o',
        )

    # Display the plot
    plt.show()
# Example usage:
plot_rating_ltv(data, 'rating', 'ltv', 5, ['A', 'B', 'C'])