# %% [markdown] # Alteryx weekly challenge 417 # %% [markdown] # The below scrip works with pre-condition that Alteryx input is loaded as dataframe from the local disk (the below path would have to be replaced individually) # %% # Import library used fro data wrangling import pandas as pd # %% # Load in Alteryx input shifts_df = pd.read_csv(r"C:\Users\rpospisil002\Downloads\awc417_input.csv") display(shifts_df) # %% # Parse out individual columns and name them shifts_df['Employee_ID'] = shifts_df['Employee_ID,Shift Dates,Shift Duration'].str.split(',').str[0] shifts_df['Shift_Dates'] = shifts_df['Employee_ID,Shift Dates,Shift Duration'].str.split(',').str[1] shifts_df['Shift_Duration'] = shifts_df['Employee_ID,Shift Dates,Shift Duration'].str.split(',').str[2] # Convert the numeric field to integer shifts_df['Shift_Duration'] = shifts_df['Shift_Duration'].astype('int64') display(shifts_df) # %% # Declare function to create flag for shift above 4 hours def flagShiftAboveFourHours(x): if x > 4: return 'Y' else: return 'N' # Apply the function and create a new column in dataframe shifts_df['Above_4'] = shifts_df['Shift_Duration'].apply(lambda x: flagShiftAboveFourHours(x)) display(shifts_df) # %% # Count shifts above 4 hours shifts_above_4_df = shifts_df[shifts_df['Above_4']=='Y'] # Flag used to filter relevant rows shifts_above_4_agg_df = shifts_above_4_df.groupby('Employee_ID').agg({'Shift_Dates':'count'}).reset_index() # count of dates by employee shifts_above_4_agg_df = shifts_above_4_agg_df.rename(columns={'Shift_Dates':'Above_4'}) # make the column self-explanatory display(shifts_above_4_agg_df) # %% # Count shifts below 4 hours shifts_below_4_df = shifts_df[shifts_df['Above_4']=='N'] # Flag used to filter relevant rows shifts_below_4_agg_df = shifts_below_4_df.groupby('Employee_ID').agg({'Shift_Dates':'count'}).reset_index() # count of dates by employee shifts_below_4_agg_df = shifts_below_4_agg_df.rename(columns={'Shift_Dates':'Below_4'}) # make the column self-explanatory display(shifts_below_4_agg_df) # %% # Count total number of shifts shifts_agg_df = shifts_df.groupby('Employee_ID').agg({'Shift_Dates':'count'}).reset_index() # count of dates by employee shifts_agg_df = shifts_agg_df.rename(columns={'Shift_Dates':'Working_Days'}) # make the column self-explanatory shifts_agg_df['non_working_days'] = 90 - shifts_agg_df['Working_Days'] # non-working days are a mere subtraction from 3 months worth of days display(shifts_agg_df) # %% # Combine aggregations together and present result result_a = shifts_agg_df.merge(shifts_above_4_agg_df, on='Employee_ID', how='left') # JOIN above 4 hours result_b = result_a.merge(shifts_below_4_agg_df, on='Employee_ID', how='left') # JOIN below 4 hours result_c = result_b.fillna(0) # replace NaN values with zeros result_c['Pct_of_shifts_below_4_from_toal_working_days'] = round(result_c['Below_4'] / result_c['Working_Days'] * 100, 2) # calcualte the percentage column display(result_c)