# %% [markdown]
# Alteryx weekly challenge 417

# %% [markdown]
# The below scrip works with pre-condition that Alteryx input is loaded as dataframe from the local disk (the below path would have to be replaced individually)

# %%
# Import library used fro data wrangling
import pandas as pd

# %%
# Load in Alteryx input
shifts_df = pd.read_csv(r"C:\Users\rpospisil002\Downloads\awc417_input.csv")
display(shifts_df)

# %%
# Parse out individual columns and name them
shifts_df['Employee_ID'] = shifts_df['Employee_ID,Shift Dates,Shift Duration'].str.split(',').str[0]
shifts_df['Shift_Dates'] = shifts_df['Employee_ID,Shift Dates,Shift Duration'].str.split(',').str[1]
shifts_df['Shift_Duration'] = shifts_df['Employee_ID,Shift Dates,Shift Duration'].str.split(',').str[2]
# Convert the numeric field to integer
shifts_df['Shift_Duration'] = shifts_df['Shift_Duration'].astype('int64')
display(shifts_df)

# %%
# Declare function to create flag for shift above 4 hours
def flagShiftAboveFourHours(x):
    if x > 4:
        return 'Y'
    else:
        return 'N'
# Apply the function and create a new column in dataframe    
shifts_df['Above_4'] = shifts_df['Shift_Duration'].apply(lambda x: flagShiftAboveFourHours(x))
display(shifts_df)

# %%
# Count shifts above 4 hours
shifts_above_4_df = shifts_df[shifts_df['Above_4']=='Y'] # Flag used to filter relevant rows
shifts_above_4_agg_df = shifts_above_4_df.groupby('Employee_ID').agg({'Shift_Dates':'count'}).reset_index() # count of dates by employee
shifts_above_4_agg_df = shifts_above_4_agg_df.rename(columns={'Shift_Dates':'Above_4'}) # make the column self-explanatory 
display(shifts_above_4_agg_df)

# %%
# Count shifts below 4 hours
shifts_below_4_df = shifts_df[shifts_df['Above_4']=='N'] # Flag used to filter relevant rows
shifts_below_4_agg_df = shifts_below_4_df.groupby('Employee_ID').agg({'Shift_Dates':'count'}).reset_index() # count of dates by employee
shifts_below_4_agg_df = shifts_below_4_agg_df.rename(columns={'Shift_Dates':'Below_4'}) # make the column self-explanatory 
display(shifts_below_4_agg_df)

# %%
# Count total number of shifts 
shifts_agg_df = shifts_df.groupby('Employee_ID').agg({'Shift_Dates':'count'}).reset_index() # count of dates by employee
shifts_agg_df = shifts_agg_df.rename(columns={'Shift_Dates':'Working_Days'}) # make the column self-explanatory
shifts_agg_df['non_working_days'] = 90 - shifts_agg_df['Working_Days'] # non-working days are a mere subtraction from 3 months worth of days
display(shifts_agg_df)

# %%
# Combine aggregations together and present result
result_a = shifts_agg_df.merge(shifts_above_4_agg_df, on='Employee_ID', how='left') # JOIN above 4 hours 
result_b = result_a.merge(shifts_below_4_agg_df, on='Employee_ID', how='left') # JOIN below 4 hours
result_c = result_b.fillna(0) # replace NaN values with zeros 
result_c['Pct_of_shifts_below_4_from_toal_working_days'] = round(result_c['Below_4'] / result_c['Working_Days'] * 100, 2) # calcualte the percentage column
display(result_c)