import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
base_color = sns.color_palette()[0]


# Cleaned CSV exports, listed in the same order as the frames unpacked below:
# basic lead details, lead interactions, demo-watch records, and the reasons
# leads gave for not being interested (descriptions taken from the file names).
_clean_csv_paths = (
    'clean/leads_basic_details_clean.csv',
    'clean/leads_interaction_details_clean.csv',
    'clean/leads_demo_watched_details_clean.csv',
    'clean/leads_reasons_for_no_interest_clean.csv',
)
leads_df, interactions_df, demos_df, no_interest_df = (
    pd.read_csv(csv_path) for csv_path in _clean_csv_paths
)


def plot_bar_chart(df,col,xlabel,y=False,ninety=True):
    """Draw a count plot of ``df[col]`` on the current pyplot axes.

    Bars follow the order returned by ``value_counts()`` (most frequent
    category first) and are all drawn in ``base_color``.

    Args:
        df: DataFrame holding the column to count.
        col: Name of the categorical column in ``df``.
        xlabel: Label for the category axis (the y axis when ``y`` is truthy).
        y: When truthy, put the categories on the y axis (horizontal bars).
        ninety: When truthy, rotate the x tick labels by 90 degrees. Only
            applied to the vertical layout, i.e. when ``y`` is falsy.
    """
    count_label = "Number of leads"
    ranked_categories = df[col].value_counts().index

    if not y:
        sns.countplot(x=col,data=df,order=ranked_categories,color=base_color)
        plt.xlabel(xlabel)
        plt.ylabel(count_label)
        if ninety:
            plt.xticks(rotation=90)
        return

    # Horizontal layout: the category label moves to the y axis.
    sns.countplot(y=col,data=df,order=ranked_categories,color=base_color)
    plt.ylabel(xlabel)
    plt.xlabel(count_label)


# Rows whose recorded drop-out stage is 'lead', with their reasons ranked by frequency.
lead_end = no_interest_df.query("stage_dropped == 'lead'")
plot_bar_chart(df=lead_end,col='reason',xlabel='Reasons')
plt.title('Reasons for not being interested in the demo');


# Same breakdown for rows whose recorded drop-out stage is 'awareness'.
awareness_end = no_interest_df.query("stage_dropped == 'awareness'")
plot_bar_chart(df=awareness_end,col='reason',xlabel='Reasons')
plt.title('Reasons for not being interested in considering the service');


# Same breakdown for rows whose recorded drop-out stage is 'consider'.
consider_end = no_interest_df.query("stage_dropped == 'consider'")
plot_bar_chart(df=consider_end,col='reason',xlabel='Reasons')
plt.title('Reasons for not being interested for conversion');


# Rows whose reason is exactly "Can't afford", inner-joined to the lead
# details on lead_id so the plots below can use education, age and
# parent occupation.
cites_affordability = no_interest_df['reason'] == "Can't afford"
afford = no_interest_df[cites_affordability]
afford_details = pd.merge(left=afford,right=leads_df,on='lead_id')
plot_bar_chart(df=afford_details,col='current_education',xlabel='Current Education',y=True)
plt.title("Current Education for leads who cited affordability");


# Age histogram (6 bins) for the same affordability subset.
plt.hist(data=afford_details,x='age',bins=6)
plt.ylabel('Count')
plt.xlabel('Age')
plt.title("Distribution of age of leads who cited affordability");


parent_count = afford_details['parent_occupation'].value_counts()
parent_total = sum(parent_count)


def _parent_slice_label(pct):
    """Label a pie slice with its percentage and the absolute count it represents."""
    return '{:.2f}%  ({:,.0f})'.format(pct,pct * parent_total/100)


plt.pie(parent_count,labels=parent_count.index,autopct=_parent_slice_label)
plt.title("Parent Occupation of leads who dropped out due to affordability");


# Rows whose reason is exactly 'Wants offline classes', inner-joined to the
# lead details on lead_id so they can be broken down by current city.
offline_class = no_interest_df[no_interest_df['reason']=='Wants offline classes']
offline_class_details = pd.merge(leads_df,offline_class,on='lead_id')
plot_bar_chart(offline_class_details,'current_city','Current City',ninety=False)
# Title fix: "preffer" -> "prefer", and the stray trailing space is removed.
plt.title("City of residence of leads who prefer offline classes");


# City distribution over every row of leads_df, drawn on a fresh 8x6 figure.
plt.figure(figsize=(8,6))
plot_bar_chart(df=leads_df,col='current_city',xlabel="Current City",ninety=False)
plt.title(label='Current City of residence of all the leads')

Text(0.5, 1.0, 'Current City of residence of all the leads')


# Rows whose reason is exactly 'Not Interested in domain', inner-joined to the
# lead details on lead_id and broken down by current education.
domain_not_interesting = no_interest_df['reason'] == 'Not Interested in domain'
no_interest_domain = no_interest_df[domain_not_interesting]
no_interest_domain_details = pd.merge(left=leads_df,right=no_interest_domain,on='lead_id')
plot_bar_chart(
    df=no_interest_domain_details,
    col='current_education',
    xlabel="Current Education",
    y=True,
)
plt.title("Current Education of leads who dropped out because \nthe domain wasn't interesting");


# Distinct lead ids that have at least one interaction at the 'conversion' stage.
reached_conversion = interactions_df['lead_stage'] == 'conversion'
converted_leads_id = interactions_df[reached_conversion]['lead_id'].unique()

# Converted leads as a whole-number percentage of all rows in leads_df.
conversion_pct = round((len(converted_leads_id) / len(leads_df)) * 100)
print(f"{conversion_pct}% of leads are successfully converted")

18% of leads are successfully converted


# Rows dropped at each stage, as a percentage of the number of rows in leads_df
# (the denominator is all leads, not only the ones that dropped).
no_interest_df['stage_dropped'].value_counts() / len(leads_df) * 100

lead         45.810056
awareness    22.067039
consider     14.245810
Name: stage_dropped, dtype: float64


# Demo-watch records of leads that dropped out, carrying the stage they dropped at.
dropped_leads_demo = pd.merge(demos_df,no_interest_df,on='lead_id')

# One row per converted lead, tagged with a pseudo-stage so it can share the
# x axis with the drop-out stages.
# - drop_duplicates(): interactions_df may hold several 'conversion' rows for
#   one lead (the conversion-rate cell de-duplicates with .unique()), which
#   would otherwise duplicate that lead's demo rows in the merge below.
# - assign(): returns a new frame instead of writing into a slice of
#   interactions_df, avoiding SettingWithCopyWarning.
converted_leads = (
    interactions_df.loc[interactions_df['lead_stage']=='conversion',['lead_id']]
    .drop_duplicates()
    .assign(stage_dropped="Conversion")
)
converted_leads_demos = pd.merge(demos_df,converted_leads,on='lead_id')

# ignore_index gives the combined frame a clean, unique index.
demo_x = pd.concat([dropped_leads_demo,converted_leads_demos],ignore_index=True)

# Map each plotted category to its tick label. Passing the keys as `order`
# pins the box positions, so the labels can no longer be attached to the
# wrong box when the categories appear in a different order in the data.
stage_labels = {
    'awareness': "Dropped at awareness",
    'consider': "Dropped at consideration",
    'Conversion': "Successfully Converted",
}
plt.figure(figsize=(10,6))
sns.boxplot(
    x='stage_dropped',
    y='watched_percentage',
    data=demo_x,
    order=list(stage_labels),
    color=base_color,
)
labels = list(stage_labels.values())
plt.xticks(range(len(labels)),labels)
plt.ylabel('Watch Percentage')
plt.xlabel('Stage status')
plt.title("Demo Video Engagement");


# Distribution of watched_percentage for each demo language, over all demo records.
sns.boxplot(
    data=demos_df,
    x='language',
    y='watched_percentage',
    color=base_color,
)
plt.ylabel("Watched Percentage")
plt.xlabel("Demo Language")
plt.title("Demo Video Engagement");


# Lead details for every lead that gave a no-interest reason, flagged as not converted.
dropped_leads_details = pd.merge(leads_df,no_interest_df,on='lead_id')
dropped_leads_details['converted'] = "No"

# One row per converted lead, flagged as converted.
# - drop_duplicates(): interactions_df may hold several 'conversion' rows for
#   one lead (the conversion-rate cell de-duplicates with .unique()), which
#   would otherwise inflate the "Yes" bars after the merge.
# - assign(): returns a new frame instead of writing into a slice of
#   interactions_df, avoiding SettingWithCopyWarning.
converted_leads = (
    interactions_df.loc[interactions_df['lead_stage']=='conversion',['lead_id']]
    .drop_duplicates()
    .assign(converted="Yes")
)
converted_leads_details = pd.merge(leads_df,converted_leads,on='lead_id')

# ignore_index gives the combined frame a clean, unique index.
details_x = pd.concat([dropped_leads_details,converted_leads_details],ignore_index=True)

plt.figure(figsize=(10,6))
# Sources ordered by total number of leads (converted and dropped together).
details_x_order = details_x['lead_gen_source'].value_counts().index
sns.countplot(x='lead_gen_source',hue='converted',data=details_x,hue_order=['Yes','No'],order=details_x_order)
plt.ylabel('Number of leads')
plt.xlabel('Source of Lead')
plt.title("Source from which converted leads were generated from");


# Gender split of the converted leads; each slice shows its percentage and absolute count.
gender_count = converted_leads_details['gender'].value_counts()
gender_total = sum(gender_count)


def _gender_slice_label(pct):
    """Label a pie slice with its percentage and the absolute count it represents."""
    return '{:.2f}%  ({:,.0f})'.format(pct,pct * gender_total/100)


plt.pie(gender_count,labels=gender_count.index,autopct=_gender_slice_label)
plt.title("Gender Composition of Successful Converted leads");

Edtech Analysis

Introduction

Overview of the dataset

Loading libraries

Load in the cleaned data

Analysis

Why are leads dropping out

Do leads find the demos engaging?

Converted Leads Details