lindeloev · January 18, 2022 18:26
diff --git a/wordle_solver.R b/wordle_solver.R
 # Use these functions to make smart guesses for Wordle (https://www.powerlanguage.co.uk/wordle/)
 # find_words() returns the words that satisfy the Wordle feedback. Start with `possible_words`, i.e., all 5-letter English words.
 # next_word() returns words that are most likely to result in green letters.
 #
 # A pretty good strategy with the next_word() output is to use "learn" for the first two words and "guess" thereafter, picking the first commonly-known word.

 #############
 # FUNCTIONS #
 #############

 #' Rank candidate words by how likely they are to produce green letters
 #'
 #' Every word is scored by summing, over its positions, the number of words in
 #' `words` that have the same letter in the same position.
 #'
 #' @param words A character vector of words, all of the same length.
 #' @param strategy One of
 #'  * `"guess"`: only optimize for green letters.
 #'  * `"learn"`: optimize for green AND yellow letters, and only return words
 #'    without repeated letters.
 #' @param n How many top hits to show (from best to worst).
 #' @return A character vector of at most `n` words (best first). An empty
 #'   `words` gives `character(0)`.
 next_word = function(words, strategy = "guess", n = 3) {
   # Fail loudly on an unknown strategy instead of silently returning NULL.
   strategy = match.arg(strategy, c("guess", "learn"))
   if (length(words) == 0) {
     return(character(0))
   }

   # Validate before any per-position work, which relies on a common length.
   wordlength = unique(nchar(words))
   stopifnot("All words must have the same length" = length(wordlength) == 1)

   # Give identification of yellow letters half the info-weight of green letters.
   yellow_weight = 0.5 / wordlength

   score = numeric(length(words))
   for (i in seq_len(wordlength)) {
     # Green: how many words share this word's letter at position i.
     letter_i = substr(words, i, i)
     green_counts = table(letter_i)
     score = score + as.numeric(green_counts[letter_i])

     # Also weight in yellow characters for the "learn" strategy, i.e., correct
     # characters in the wrong position.
     if (strategy == "learn") {
       other_letters = paste0(substr(words, 1, i - 1), substr(words, i + 1, wordlength)) |>
         strsplit("") |>
         unlist() |>
         as.character()
       yellow_counts = as.numeric(table(other_letters)[letter_i])
       # A letter that never occurs in another position has no table entry
       # (NA); it must contribute 0 rather than turn the whole score into NA.
       yellow_counts[is.na(yellow_counts)] = 0
       score = score + yellow_counts * yellow_weight
     }
   }

   # Best-scoring words first
   ranked = unname(words)[order(-score)]

   if (strategy == "learn") {
     # Repeated letters waste a slot when the goal is to learn about letters.
     has_unique_letters = vapply(strsplit(ranked, ""), \(x) anyDuplicated(x) == 0, logical(1))
     ranked = ranked[has_unique_letters]
   }
   head(ranked, n)
 }

  
 #' Find words that fulfill Wordle criteria
 #'
 #' @param words Vector of possible words at this step, e.g., `c("goats", "horse")`.
 #' @param green Green characters in their correct position, e.g., `"s???e"`.
 #'   Write ? where there are no green characters.
 #' @param grey Grey characters, e.g., `"car"`. A grey letter that is also green
 #'   is only ruled out at the non-green positions; a grey letter that is also
 #'   yellow is left to the yellow rules.
 #' @param yellows Yellow characters in the (wrong) position where they were
 #'   guessed, e.g., `c("???es", "??i??")`. Write ? elsewhere.
 #' @return A character vector with the words that are still possible.
 find_words = function(words, green = "?????", grey = "", yellows = c()) {
   # Treat all feedback case-insensitively (words are expected in lower case).
   green = tolower(green)
   yellows = tolower(yellows)
   green_letters = strsplit(green, "")[[1]]
   yellow_letters = setdiff(unlist(strsplit(yellows, "")), "?")

   # GREEN: Keep words matching green letters in their position
   regex_green = paste0("^", gsub("?", "[a-z]", green, fixed = TRUE), "$")
   words_remaining = words[grepl(regex_green, words)]

   # GREY: Remove words with grey letters
   for (letter in unique(strsplit(tolower(grey), "")[[1]])) {
     if (letter %in% yellow_letters) {
       # The letter is in the word; the yellow rules below handle it.
       next
     }
     if (letter %in% green_letters) {
       # The letter may only occur at its green position(s). All remaining
       # words already have it there, so just forbid any extra occurrence.
       n_green = sum(green_letters == letter)
       n_in_word = nchar(words_remaining) - nchar(gsub(letter, "", words_remaining, fixed = TRUE))
       words_remaining = words_remaining[n_in_word == n_green]
     } else {
       words_remaining = words_remaining[!grepl(letter, words_remaining, fixed = TRUE)]
     }
   }

   # YELLOW
   for (yellow in yellows) {
     letters_i = strsplit(yellow, "")[[1]]

     # Iterate over positions (not letters) so that a letter occurring twice in
     # one pattern is checked at each of its own positions.
     for (position in which(letters_i != "?")) {
       letter = letters_i[position]

       # Yellow letter must not be in the entered position
       not_here = substr(words_remaining, position, position) != letter

       # Yellow letter must be present
       present = grepl(letter, words_remaining, fixed = TRUE)

       words_remaining = words_remaining[not_here & present]
     }
   }

   words_remaining
 }



 ############
 # APPLY IT #
 ############
 # Vector of all English words (downloaded on every run; needs network access)
 all_words = read.csv("https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt", header = FALSE, col.names = "word")$word
 possible_words = all_words[nchar(all_words) == 5]

 # Each session below starts from the full word list, asks next_word() for a
 # suggestion, and then narrows `words` with the feedback Wordle gave.
 # Arguments are spelled out in full (`yellows =`) so that the calls do not
 # rely on partial argument matching.

 # 2022-01-01: Third guess
 words = find_words(possible_words)
 next_word(words, "learn")  # I guess CARES

 words = find_words(words, green = "????s", grey = "ca", yellows = c("??re?"))
 next_word(words, "learn")  # I guess TIERS

 words = find_words(words, green = "????s", grey = "cati", yellows = c("??re?", "??er?"))
 next_word(words, "guess", n = 100)  # I guess REBUS


 # 2022-01-02: Third guess
 words = find_words(possible_words)
 next_word(words, "learn")  # I guess CARES

 words = find_words(words, grey = "care", yellows = c("????s"))
 next_word(words, "learn")  # I guess SOILY

 words = find_words(words, green = "?o???", grey = "careily", yellows = c("????s", "s????"))
 next_word(words, "guess", n = 100)  # I guess BOOST


 # 2022-01-03: Fourth guess
 words = find_words(possible_words)
 next_word(words, "learn")  # I guess CARES

 words = find_words(words, green = "????s", grey = "cae", yellows = c("??r??"))
 next_word(words, "learn")  # I guess GROTS

 words = find_words(words, green = "?r??s", grey = "caego", yellows = c("??r??", "???t?"))
 next_word(words, "guess", n = 100)  # I guess TRIMS

 words = find_words(words, green = "tr??s", grey = "caegoim", yellows = c("??r??", "???t?"))
 next_word(words, "guess", n = 100)  # I guess TRUSS


 # 2022-01-04: Third guess
 words = find_words(possible_words)
 next_word(words, "learn")  # I guess CARES

 words = find_words(words, grey = "car", yellows = c("???es"))
 next_word(words, "learn")  # I guess STILE

 words = find_words(words, green = "s???e", grey = "cartl", yellows = c("???es", "??i??"))
 next_word(words, "guess", n = 100)  # I guess SIEGE


 # 2022-01-05: Third guess
 words = find_words(possible_words)
 next_word(words, "learn")  # I guess CARES

 words = find_words(words, green = "???e?", grey = "cas", yellows = c("??r??"))
 next_word(words, "learn")  # DOTER

 words = find_words(words, green = "???er", grey = "casdo", yellows = c("??r??", "??t??"))
 next_word(words, "guess", n = 100)  # TIGER


 # 2022-01-06: third guess
 words = find_words(possible_words)
 next_word(words, "learn")  # CARES

 words = find_words(words, green = "?a???", grey = "cres")
 next_word(words, "learn")  # MANLY

 words = find_words(words, green = "?an??", grey = "cresmy", yellows = c("???l?"))
 next_word(words, "guess", n = 100)  # BANAL
	# Use these functions to make smart guesses for Wordle (https://www.powerlanguage.co.uk/wordle/)
	# find_words() returns the words that satisfy the Wordle feedback. Start with `possible_words`, i.e., all 5-letter English words.
	# next_word() returns words that are most likely to result in green letters.
	#
	# A pretty good strategy with the next_word() output is to use "learn" for the first two words and "guess" thereafter, picking the first commonly-known word.

	#############
	# FUNCTIONS #
	#############

	#' Rank candidate words by how likely they are to produce green letters
	#'
	#' Every word is scored by summing, over its positions, the number of words in
	#' `words` that have the same letter in the same position.
	#'
	#' @param words A character vector of words, all of the same length.
	#' @param strategy One of
	#' * `"guess"`: only optimize for green letters.
	#' * `"learn"`: optimize for green AND yellow letters, and only return words
	#'   without repeated letters.
	#' @param n How many top hits to show (from best to worst).
	#' @return A character vector of at most `n` words (best first). An empty
	#'   `words` gives `character(0)`.
	next_word = function(words, strategy = "guess", n = 3) {
	  # Fail loudly on an unknown strategy instead of silently returning NULL.
	  strategy = match.arg(strategy, c("guess", "learn"))
	  if (length(words) == 0) {
	    return(character(0))
	  }

	  # Validate before any per-position work, which relies on a common length.
	  wordlength = unique(nchar(words))
	  stopifnot("All words must have the same length" = length(wordlength) == 1)

	  # Give identification of yellow letters half the info-weight of green letters.
	  yellow_weight = 0.5 / wordlength

	  score = numeric(length(words))
	  for (i in seq_len(wordlength)) {
	    # Green: how many words share this word's letter at position i.
	    letter_i = substr(words, i, i)
	    green_counts = table(letter_i)
	    score = score + as.numeric(green_counts[letter_i])

	    # Also weight in yellow characters for the "learn" strategy, i.e., correct
	    # characters in the wrong position.
	    if (strategy == "learn") {
	      other_letters = paste0(substr(words, 1, i - 1), substr(words, i + 1, wordlength)) |>
	        strsplit("") |>
	        unlist() |>
	        as.character()
	      yellow_counts = as.numeric(table(other_letters)[letter_i])
	      # A letter that never occurs in another position has no table entry
	      # (NA); it must contribute 0 rather than turn the whole score into NA.
	      yellow_counts[is.na(yellow_counts)] = 0
	      score = score + yellow_counts * yellow_weight
	    }
	  }

	  # Best-scoring words first
	  ranked = unname(words)[order(-score)]

	  if (strategy == "learn") {
	    # Repeated letters waste a slot when the goal is to learn about letters.
	    has_unique_letters = vapply(strsplit(ranked, ""), \(x) anyDuplicated(x) == 0, logical(1))
	    ranked = ranked[has_unique_letters]
	  }
	  head(ranked, n)
	}


	#' Find words that fulfill Wordle criteria
	#'
	#' @param words Vector of possible words at this step, e.g., `c("goats", "horse")`.
	#' @param green Green characters in their correct position, e.g., `"s???e"`.
	#'   Write ? where there are no green characters.
	#' @param grey Grey characters, e.g., `"car"`. A grey letter that is also green
	#'   is only ruled out at the non-green positions; a grey letter that is also
	#'   yellow is left to the yellow rules.
	#' @param yellows Yellow characters in the (wrong) position where they were
	#'   guessed, e.g., `c("???es", "??i??")`. Write ? elsewhere.
	#' @return A character vector with the words that are still possible.
	find_words = function(words, green = "?????", grey = "", yellows = c()) {
	  # Treat all feedback case-insensitively (words are expected in lower case).
	  green = tolower(green)
	  yellows = tolower(yellows)
	  green_letters = strsplit(green, "")[[1]]
	  yellow_letters = setdiff(unlist(strsplit(yellows, "")), "?")

	  # GREEN: Keep words matching green letters in their position
	  regex_green = paste0("^", gsub("?", "[a-z]", green, fixed = TRUE), "$")
	  words_remaining = words[grepl(regex_green, words)]

	  # GREY: Remove words with grey letters
	  for (letter in unique(strsplit(tolower(grey), "")[[1]])) {
	    if (letter %in% yellow_letters) {
	      # The letter is in the word; the yellow rules below handle it.
	      next
	    }
	    if (letter %in% green_letters) {
	      # The letter may only occur at its green position(s). All remaining
	      # words already have it there, so just forbid any extra occurrence.
	      n_green = sum(green_letters == letter)
	      n_in_word = nchar(words_remaining) - nchar(gsub(letter, "", words_remaining, fixed = TRUE))
	      words_remaining = words_remaining[n_in_word == n_green]
	    } else {
	      words_remaining = words_remaining[!grepl(letter, words_remaining, fixed = TRUE)]
	    }
	  }

	  # YELLOW
	  for (yellow in yellows) {
	    letters_i = strsplit(yellow, "")[[1]]

	    # Iterate over positions (not letters) so that a letter occurring twice in
	    # one pattern is checked at each of its own positions.
	    for (position in which(letters_i != "?")) {
	      letter = letters_i[position]

	      # Yellow letter must not be in the entered position
	      not_here = substr(words_remaining, position, position) != letter

	      # Yellow letter must be present
	      present = grepl(letter, words_remaining, fixed = TRUE)

	      words_remaining = words_remaining[not_here & present]
	    }
	  }

	  words_remaining
	}



	############
	# APPLY IT #
	############
	# Vector of all English words (downloaded on every run; needs network access)
	all_words = read.csv("https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt", header = FALSE, col.names = "word")$word
	possible_words = all_words[nchar(all_words) == 5]

	# Each session below starts from the full word list, asks next_word() for a
	# suggestion, and then narrows `words` with the feedback Wordle gave.
	# Arguments are spelled out in full (`yellows =`) so that the calls do not
	# rely on partial argument matching.

	# 2022-01-01: Third guess
	words = find_words(possible_words)
	next_word(words, "learn") # I guess CARES

	words = find_words(words, green = "????s", grey = "ca", yellows = c("??re?"))
	next_word(words, "learn") # I guess TIERS

	words = find_words(words, green = "????s", grey = "cati", yellows = c("??re?", "??er?"))
	next_word(words, "guess", n = 100) # I guess REBUS


	# 2022-01-02: Third guess
	words = find_words(possible_words)
	next_word(words, "learn") # I guess CARES

	words = find_words(words, grey = "care", yellows = c("????s"))
	next_word(words, "learn") # I guess SOILY

	words = find_words(words, green = "?o???", grey = "careily", yellows = c("????s", "s????"))
	next_word(words, "guess", n = 100) # I guess BOOST


	# 2022-01-03: Fourth guess
	words = find_words(possible_words)
	next_word(words, "learn") # I guess CARES

	words = find_words(words, green = "????s", grey = "cae", yellows = c("??r??"))
	next_word(words, "learn") # I guess GROTS

	words = find_words(words, green = "?r??s", grey = "caego", yellows = c("??r??", "???t?"))
	next_word(words, "guess", n = 100) # I guess TRIMS

	words = find_words(words, green = "tr??s", grey = "caegoim", yellows = c("??r??", "???t?"))
	next_word(words, "guess", n = 100) # I guess TRUSS


	# 2022-01-04: Third guess
	words = find_words(possible_words)
	next_word(words, "learn") # I guess CARES

	words = find_words(words, grey = "car", yellows = c("???es"))
	next_word(words, "learn") # I guess STILE

	words = find_words(words, green = "s???e", grey = "cartl", yellows = c("???es", "??i??"))
	next_word(words, "guess", n = 100) # I guess SIEGE


	# 2022-01-05: Third guess
	words = find_words(possible_words)
	next_word(words, "learn") # I guess CARES

	words = find_words(words, green = "???e?", grey = "cas", yellows = c("??r??"))
	next_word(words, "learn") # DOTER

	words = find_words(words, green = "???er", grey = "casdo", yellows = c("??r??", "??t??"))
	next_word(words, "guess", n = 100) # TIGER


	# 2022-01-06: third guess
	words = find_words(possible_words)
	next_word(words, "learn") # CARES

	words = find_words(words, green = "?a???", grey = "cres")
	next_word(words, "learn") # MANLY

	words = find_words(words, green = "?an??", grey = "cresmy", yellows = c("???l?"))
	next_word(words, "guess", n = 100) # BANAL