# %% # Create Alteryx input input = [{'date_field':'29-01-2020,dd.mm.yyyy'}, {'date_field':'03/26/1920,mm/dd/yyyy'}, {'date_field':'11/16/2008,mm-dd-yyyy'}, {'date_field':'15/12/2005,dd/mm/yyyy'}, {'date_field':'1996-15-03,yyyy-dd-mm'} ] input_df = spark.createDataFrame(input) display(input_df) # %% # Split date_field into two columns date_string and date_format import pyspark.sql.functions as F input_df_split = ( input_df.withColumn('date_string',F.substring(F.col('date_field'),1,10)) .withColumn('date_format',F.substring(F.col('date_field'),12,10)) ) display(input_df_split) # %% # Create column with formatted date input_df_split.createOrReplaceTempView('input_df_split_view') output_df = spark.sql( """ SELECT * ,CAST( CASE WHEN LEFT(input_df_split_view.date_format,1) = 'd' THEN CONCAT(RIGHT(input_df_split_view.date_string,4),'-',SUBSTRING(input_df_split_view.date_string,4,2),'-',LEFT(input_df_split_view.date_string,2)) WHEN LEFT(input_df_split_view.date_format,1) = 'm' THEN CONCAT(RIGHT(input_df_split_view.date_string,4),'-',LEFT(input_df_split_view.date_string,2),'-',SUBSTRING(input_df_split_view.date_string,4,2)) ELSE CONCAT(LEFT(input_df_split_view.date_string,4),'-',RIGHT(input_df_split_view.date_string,2),'-',SUBSTRING(input_df_split_view.date_string,6,2)) END AS date) AS date_formatted FROM input_df_split_view ORDER BY date_formatted """ ) display(output_df) # %% # Create column with formatted date using python udf def parseDate(date_string_col, date_format_col): return F.when( date_format_col.substr(2, 1) == "d", F.concat(F.substring(date_string_col, 7, 4), F.lit("-"), F.substring(date_string_col, 4, 2), F.lit("-"), F.substring(date_string_col, 1, 2)) ).when( date_format_col.substr(2, 1) == "m", F.concat(F.substring(date_string_col, 7, 4), F.lit("-"), F.substring(date_string_col, 1, 2), F.lit("-"), F.substring(date_string_col, 4, 2)) ).otherwise( F.concat(F.substring(date_string_col, 1, 4), F.lit("-"), F.substring(date_string_col, 6, 2), F.lit("-"), F.substring(date_string_col, 9, 2)) ) output_df_2 = output_df.withColumn('date_formatted_udf',parseDate(F.col('date_string'),F.col('date_format'))) display(output_df_2)