Created
March 24, 2025 23:01
-
-
Save antoniosb/07168c0b9ff3bbe987f3bcdb10af2ea7 to your computer and use it in GitHub Desktop.
PL/pgSQL function that parses PDF timestamps
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- parse_pdf_date(pdf_date): convert a PDF date string of the form
--   D:YYYYMMDDHHmmSS[Z | (+|-)HH['][mm][']]
-- into a TIMESTAMP WITH TIME ZONE.
--
-- Parameters:
--   pdf_date  raw PDF date string, e.g. D:20240115103000+05'30'
--
-- Returns:
--   The instant described by the string, or NULL when the input is NULL,
--   lacks the 'D:' prefix, has fewer than 14 leading digits, names an
--   impossible calendar date/time, or carries an out-of-range UTC offset.
--
-- Time zone handling:
--   * '+HH''mm''' means local time is AHEAD of UTC, '-HH''mm''' BEHIND it.
--     The minutes and either apostrophe are optional.
--   * 'Z', 'Z00''00''', a missing suffix, or an unrecognised suffix all
--     resolve to UTC, so the result never depends on the session TimeZone
--     setting (required for the IMMUTABLE declaration to be truthful).
CREATE OR REPLACE FUNCTION parse_pdf_date(pdf_date TEXT) RETURNS TIMESTAMP WITH TIME ZONE AS $$
DECLARE
    digits             TEXT;          -- the 14 date/time digits after 'D:'
    tz_suffix          TEXT;          -- whatever follows those digits
    tz_parts           TEXT[];        -- {sign, hours, minutes} of a numeric offset
    tz_hours           INTEGER;
    tz_minutes         INTEGER;
    utc_offset_minutes INTEGER := 0;  -- signed; positive = east of Greenwich
    wall_clock         TIMESTAMP;     -- local date/time exactly as written
BEGIN
    -- Require the 'D:' prefix followed by a full YYYYMMDDHHmmSS field.
    IF pdf_date IS NULL OR pdf_date !~ '^D:\d{14}' THEN
        RETURN NULL;
    END IF;

    digits    := substring(pdf_date FROM 3 FOR 14);
    tz_suffix := substring(pdf_date FROM 17);

    -- make_timestamp tolerates hour 24 and second 60; PDF dates do not.
    IF substring(digits FROM 9 FOR 2)::INTEGER > 23
       OR substring(digits FROM 13 FOR 2)::INTEGER > 59 THEN
        RETURN NULL;
    END IF;

    -- Build the wall-clock value without consulting the session time zone.
    BEGIN
        wall_clock := make_timestamp(
            substring(digits FROM 1 FOR 4)::INTEGER,
            substring(digits FROM 5 FOR 2)::INTEGER,
            substring(digits FROM 7 FOR 2)::INTEGER,
            substring(digits FROM 9 FOR 2)::INTEGER,
            substring(digits FROM 11 FOR 2)::INTEGER,
            substring(digits FROM 13 FOR 2)::DOUBLE PRECISION
        );
    EXCEPTION
        WHEN others THEN
            RETURN NULL; -- impossible date, e.g. month 13 or February 30
    END;

    -- Numeric offset: sign, two-digit hours, optional two-digit minutes.
    tz_parts := regexp_match(tz_suffix, '^([+-])(\d{2})''?(\d{2})?''?$');
    IF tz_parts IS NOT NULL THEN
        tz_hours   := tz_parts[2]::INTEGER;
        tz_minutes := COALESCE(tz_parts[3], '00')::INTEGER;

        IF tz_hours > 23 OR tz_minutes > 59 THEN
            RETURN NULL; -- offset outside the valid HH/mm range
        END IF;

        utc_offset_minutes := tz_hours * 60 + tz_minutes;
        IF tz_parts[1] = '-' THEN
            utc_offset_minutes := -utc_offset_minutes;
        END IF;
    END IF;

    -- UTC = local time minus offset. Plain arithmetic is used instead of a
    -- 'UTC+05:00' zone name because PostgreSQL reads such names with the
    -- POSIX sign convention (positive = west), which would invert the shift.
    RETURN (wall_clock - utc_offset_minutes * INTERVAL '1 minute') AT TIME ZONE 'UTC';
END;
$$ LANGUAGE plpgsql IMMUTABLE;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment