Created
November 10, 2019 10:05
-
-
Save danielfone/2b2a084c5cf8baa22d460b86a7e59f5f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
with recursive | |
-- extract all the messages as rows from the several `conversations.history` | |
-- API responses in the `slack_history` table | |
messages as ( | |
select jsonb_array_elements(response->'messages') as message from markov.slack_history | |
), | |
-- remove @heres, @names, exclude bot messages, etc | |
cleaned_messages as ( | |
select message->>'ts' as id, regexp_replace(message->>'text', '\<[!@#]\S+\> ?', '', 'g') as text | |
from messages | |
where message->'subtype' is null | |
), | |
-- split each message into a text array | |
message_word_lists as ( | |
select id, string_to_array(text, ' ') as list | |
from cleaned_messages | |
), | |
-- split all words into separate rows with a column for the original message | |
-- and a number that represents the word's sequence in the message. includes a | |
-- NULL word at the start and end of each message. This is initiator and | |
-- terminator for the chain. | |
word_indexes as ( | |
select id, list[sequence] as word, sequence | |
from (select id, list, generate_series(0, array_length(list, 1)+1) as sequence from message_word_lists) indicies | |
), | |
-- self-join our words on neighbouring sequences to produce rows containing | |
-- words that occured in pairs in the original message (bigrams) | |
-- e.g. | Hello | how | | |
-- | how | are | | |
-- | are | you? | | |
bigrams as ( | |
select lefts.word as left, rights.word as right | |
from word_indexes lefts | |
join word_indexes rights on lefts.sequence = rights.sequence - 1 and lefts.id = rights.id | |
), | |
-- 1. start with a random null-initiated bigram, this is the start of a | |
-- message | |
-- 2. shuffle the bigrams and pick a random word which has followed the first | |
-- word before. Probability of choice is determined by number of occurances | |
-- in the text, since it's random sorting | |
-- 3. Finish when we select a word with a null RHS | |
-- | |
markov_chain(word) as ( | |
(select bigrams.right as word from bigrams where bigrams.left is null order by random() limit 1) | |
union all | |
(select bigrams.right as word from bigrams join markov_chain on markov_chain.word = bigrams.left order by random() limit 1) | |
) | |
--select * from messages | |
--select * from cleaned_messages | |
--select * from message_word_lists | |
--select * from word_indexes | |
select string_agg(word, ' ') from markov_chain | |
--select * from bigrams where bigrams.left = 'perfect.' | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment