Skip to content

Instantly share code, notes, and snippets.

@antoniosb
Created March 24, 2025 23:01
Show Gist options
  • Save antoniosb/07168c0b9ff3bbe987f3bcdb10af2ea7 to your computer and use it in GitHub Desktop.
Save antoniosb/07168c0b9ff3bbe987f3bcdb10af2ea7 to your computer and use it in GitHub Desktop.
PL/pgSQL function that parses PDF timestamps
-- parse_pdf_date: convert a PDF date string into a TIMESTAMP WITH TIME ZONE.
--
-- Accepted input: 'D:' followed by 14 digits (YYYYMMDDHHmmSS) and an optional
-- UT suffix:
--   (+|-)HH'mm   or   (+|-)HH'mm'   local time is later (+) / earlier (-) than UT
--   Z            or   Z00'00'        local time equals UT
--
-- Parameters:
--   pdf_date  the PDF date string, e.g. 'D:20250324230100+05''30'''.
--
-- Returns:
--   The instant described by the string, or NULL when the input is NULL, does
--   not start with 'D:' plus 14 digits, names an impossible date/time
--   (e.g. month 13), or carries an offset to_timestamp() rejects as out of range.
--
-- Notes:
--   * A missing suffix, 'Z', and any suffix that is not a well-formed
--     (+|-)HH'mm offset are all read as UTC. The result therefore never depends
--     on the session TimeZone setting, which is what IMMUTABLE promises.
--   * The offset is applied with the ISO-8601 sign convention (positive = east
--     of Greenwich). Do NOT build a zone name such as 'UTC+05:30' for
--     AT TIME ZONE: that is a POSIX-style spec whose sign is inverted.
--   * Uses the TZH/TZM template patterns of to_timestamp() (PostgreSQL 11+).
CREATE OR REPLACE FUNCTION parse_pdf_date(pdf_date TEXT) RETURNS TIMESTAMP WITH TIME ZONE AS $$
DECLARE
    timestamp_part TEXT;              -- the 14 digits YYYYMMDDHHmmSS
    tz_suffix      TEXT;              -- everything after the 14 digits (may be empty)
    utc_offset     TEXT := '+00:00';  -- ISO-8601 style offset; defaults to UTC
BEGIN
    -- Reject NULL and anything that is not 'D:' followed by at least 14 digits.
    IF pdf_date IS NULL OR pdf_date !~ '^D:\d{14}' THEN
        RETURN NULL;
    END IF;

    -- Positions are fixed once the guard above has passed:
    -- chars 1-2 are 'D:', chars 3-16 the digits, char 17 onward the suffix.
    timestamp_part := substring(pdf_date FROM 3 FOR 14);
    tz_suffix := substring(pdf_date FROM 17);

    -- Turn (+|-)HH'mm['] into (+|-)HH:mm; every other suffix keeps the UTC default.
    IF tz_suffix ~ '^[+-]\d{2}''\d{2}''?$' THEN
        utc_offset := substring(tz_suffix FROM 1 FOR 3) || ':' || substring(tz_suffix FROM 5 FOR 2);
    END IF;

    -- Parse digits and offset in one step so no intermediate value is ever
    -- interpreted in the session time zone. Invalid field values (month > 12,
    -- Feb 30, hour 24, offset hours out of range, ...) raise and map to NULL.
    BEGIN
        RETURN to_timestamp(timestamp_part || ' ' || utc_offset, 'YYYYMMDDHH24MISS TZH:TZM');
    EXCEPTION
        WHEN others THEN
            RETURN NULL;
    END;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment