Created
September 20, 2024 08:03
-
-
Save thangarajan8/351ae5e85cc1f23318c9696d7213a976 to your computer and use it in GitHub Desktop.
date_time_diff
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import SparkSession | |
from pyspark.sql.functions import col, udf, expr, hour,format_number | |
from pyspark.sql.types import LongType | |
import pandas as pd | |
# Obtain the Spark session used by the rest of this script; getOrCreate()
# is called on a builder whose application name is "DateDifference".
spark = (
    SparkSession.builder
    .appName("DateDifference")
    .getOrCreate()
)
def exclude_weekends_and_jan1(start_date, end_date):
    """Count business days between two dates, skipping every January 1st.

    Args:
        start_date: First day of the range (inclusive). Anything accepted by
            ``pd.date_range`` (``datetime``, ``pd.Timestamp``, ISO string).
        end_date: Last day of the range (inclusive), same accepted types.

    Returns:
        The number of Monday-Friday days in the range that are not
        January 1st, as a plain ``int``. Returns ``None`` when either bound
        is ``None`` so that a NULL input yields a NULL result instead of an
        exception. Returns 0 when the range contains no business day.
    """
    if start_date is None or end_date is None:
        return None

    # freq='B' yields Monday-Friday only, so weekends are already excluded.
    business_days = pd.date_range(start=start_date, end=end_date, freq='B')

    # Match on month/day rather than on one specific timestamp: this drops
    # January 1st of *every* year covered by the range and still works when
    # the generated timestamps carry a time-of-day component.
    is_jan1 = (business_days.month == 1) & (business_days.day == 1)

    # Convert to a built-in int so the value maps cleanly onto LongType.
    return int((~is_jan1).sum())
# Wrap the business-day counter as a Spark UDF whose declared return type
# is LongType.
exclude_udf = udf(f=exclude_weekends_and_jan1, returnType=LongType())
def calculate_difference(df, timestamp_col, date_col):
    """Add business-day and hour-difference columns to ``df``.

    Args:
        df: Input Spark DataFrame.
        timestamp_col: Name of the column holding the end timestamp.
        date_col: Name of the column holding the start date. It is cast to
            timestamp in the returned DataFrame.

    Returns:
        A DataFrame with these additional columns:
        ``business_days_diff`` - result of ``exclude_udf(date_col, timestamp_col)``;
        ``hdiff`` - ``hour(timestamp_col) - hour(date_col)``;
        ``h1`` - ``business_days_diff + hdiff / 100`` cast to float;
        ``formatted_result`` - ``h1`` formatted with two decimal places.
    """
    # Cast the date column to timestamp before it is passed to the UDF and
    # to hour().
    df = df.withColumn(date_col, col(date_col).cast("timestamp"))

    # Business days from the start date up to the end timestamp.
    df_with_diff = df.withColumn(
        "business_days_diff",
        exclude_udf(col(date_col), col(timestamp_col)),
    )

    # Use the caller-supplied column names here; hard-coding "timestamp" and
    # "date" would silently ignore the function's parameters.
    df_with_diff = df_with_diff.withColumn(
        "hdiff",
        hour(col(timestamp_col)) - hour(col(date_col)),
    )

    # Hours are placed in the fractional part: days + hours / 100.
    df_with_diff = df_with_diff.withColumn(
        "h1",
        (col("business_days_diff") + col("hdiff") / 100).cast("float"),
    )
    return df_with_diff.withColumn("formatted_result", format_number(col("h1"), 2))
# Demo: build a two-row DataFrame and print the computed differences.
data = [
    (1, '2024-09-20 10:00:00', '2024-09-15'),
    (2, '2024-09-22 15:30:00', '2024-01-18'),
]
columns = ["id", "timestamp", "date"]
df = spark.createDataFrame(data, schema=columns)

# The "timestamp" values start out as strings; cast them to a real
# timestamp type before computing differences.
df = df.withColumn("timestamp", df["timestamp"].cast("timestamp"))

# Run the calculation and display every column untruncated.
result_df = calculate_difference(df, "timestamp", "date")
result_df.show(truncate=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment