danielfone · November 10, 2019 10:05
diff --git a/markov.sql b/markov.sql
 with recursive

  -- extract all the messages as rows from the several `conversations.history`
  -- API responses in the `slack_history` table
  messages as (
    select jsonb_array_elements(response->'messages') as message from markov.slack_history
  ),
  
  -- remove @heres, @names, exclude bot messages, etc
  cleaned_messages as (
    select message->>'ts' as id, regexp_replace(message->>'text', '\<[!@#]\S+\> ?', '', 'g') as text
    from messages
    where message->'subtype' is null
  ),

  -- split each message into a text array
  message_word_lists as (
    select id, string_to_array(text, ' ') as list
    from cleaned_messages
  ),
  
  -- split all words into separate rows with a column for the original message
  -- and a number that represents the word's sequence in the message. includes a
  -- NULL word at the start and end of each message. This is initiator and
  -- terminator for the chain.
  word_indexes as (
    select id, list[sequence] as word, sequence
    from (select id, list, generate_series(0, array_length(list, 1)+1) as sequence from message_word_lists) indicies
  ),
  
  -- self-join our words on neighbouring sequences to produce rows containing
  -- words that occured in pairs in the original message (bigrams)
  -- e.g. | Hello | how  |
  --      | how   | are  |
  --      | are   | you? | 
  bigrams as (
    select lefts.word as left, rights.word as right
    from word_indexes lefts
      join word_indexes rights on lefts.sequence = rights.sequence - 1 and lefts.id = rights.id
  ),
  
  -- 1. start with a random null-initiated bigram, this is the start of a
  --    message
  -- 2. shuffle the bigrams and pick a random word which has followed the first
  --    word before. Probability of choice is determined by number of occurances
  --    in the text, since it's random sorting
  -- 3. Finish when we select a word with a null RHS
  --
  markov_chain(word) as (
    (select bigrams.right as word from bigrams where bigrams.left is null order by random() limit 1)
    union all
    (select bigrams.right as word from bigrams join markov_chain on markov_chain.word = bigrams.left order by random() limit 1)
  )
  
 --select * from messages
 --select * from cleaned_messages
 --select * from message_word_lists
 --select * from word_indexes
 select string_agg(word, ' ') from markov_chain
 --select * from bigrams where bigrams.left = 'perfect.'
 ;
	with recursive

	-- extract all the messages as rows from the several `conversations.history`
	-- API responses in the `slack_history` table
	messages as (
	select jsonb_array_elements(response->'messages') as message from markov.slack_history
	),

	-- remove @heres, @names, exclude bot messages, etc
	cleaned_messages as (
	select message->>'ts' as id, regexp_replace(message->>'text', '\<[!@#]\S+\> ?', '', 'g') as text
	from messages
	where message->'subtype' is null
	),

	-- split each message into a text array
	message_word_lists as (
	select id, string_to_array(text, ' ') as list
	from cleaned_messages
	),

	-- split all words into separate rows with a column for the original message
	-- and a number that represents the word's sequence in the message. includes a
	-- NULL word at the start and end of each message. This is initiator and
	-- terminator for the chain.
	word_indexes as (
	select id, list[sequence] as word, sequence
	from (select id, list, generate_series(0, array_length(list, 1)+1) as sequence from message_word_lists) indicies
	),

	-- self-join our words on neighbouring sequences to produce rows containing
	-- words that occured in pairs in the original message (bigrams)
	-- e.g. \| Hello \| how \|
	-- \| how \| are \|
	-- \| are \| you? \|
	bigrams as (
	select lefts.word as left, rights.word as right
	from word_indexes lefts
	join word_indexes rights on lefts.sequence = rights.sequence - 1 and lefts.id = rights.id
	),

	-- 1. start with a random null-initiated bigram, this is the start of a
	-- message
	-- 2. shuffle the bigrams and pick a random word which has followed the first
	-- word before. Probability of choice is determined by number of occurances
	-- in the text, since it's random sorting
	-- 3. Finish when we select a word with a null RHS
	--
	markov_chain(word) as (
	(select bigrams.right as word from bigrams where bigrams.left is null order by random() limit 1)
	union all
	(select bigrams.right as word from bigrams join markov_chain on markov_chain.word = bigrams.left order by random() limit 1)
	)

	--select * from messages
	--select * from cleaned_messages
	--select * from message_word_lists
	--select * from word_indexes
	select string_agg(word, ' ') from markov_chain
	--select * from bigrams where bigrams.left = 'perfect.'
	;