jgomo3 · May 10, 2025 13:18
diff --git a/split-into-words.clj b/split-into-words.clj
 ;; Naive word splitter: returns the successive matches of \w+ in `text`.
 ;; On the JVM, \w is ASCII-only by default ([a-zA-Z_0-9]), so any
 ;; accented letter acts as a separator.
 (defn split-into-words [text]
   (re-seq #"\w+" text))

 (comment
  ;; split-into-words, implemented with the trivial \w+ regular
  ;; expression, works fine in English:
  
  (split-into-words "Have a nice day.")
  ;; => ("Have" "a" "nice" "day")

  ;; But it fails with other languages, such as Spanish.  In the
  ;; following example, the word "día" is split into "d" and "a":

  (split-into-words "Que tenga un buen día")
  ;; => ("Que" "tenga" "un" "buen" "d" "a")
 )

 ;; Unicode-aware word splitter: returns the successive words found in
 ;; the given text.  A word starts with an alphabetic character and
 ;; continues with alphabetic characters or combining marks (\p{M}).
 ;; Including the marks matters for decomposed (NFD) text: there "í" is
 ;; stored as "i" followed by U+0301, which is not itself alphabetic and
 ;; would otherwise cut the word in two.  Precomposed text is matched
 ;; exactly as with a plain \p{IsAlphabetic}+.
 (def split-into-words
   (partial re-seq #"\p{IsAlphabetic}[\p{IsAlphabetic}\p{M}]*"))


 (comment
  ;; Now, split-into-words, implemented with the regular expression
  ;; with Unicode Support, works correctly with other languages like
  ;; Spanish:
  
  (split-into-words "Que tenga un buen día")
  ;; => ("Que" "tenga" "un" "buen" "día")
 )
diff --git a/split-into-words.es.clj b/split-into-words.es.clj
 ;; Demostración del uso de expresiones regulares con soporte de Unicode.
 ;; Divide un texto en palabras.

 ;; Naive word splitter: returns the successive matches of \w+ in `text`.
 ;; On the JVM, \w is ASCII-only by default ([a-zA-Z_0-9]), so any
 ;; accented letter acts as a separator.
 (defn split-into-words [text]
   (re-seq #"\w+" text))

 (comment
  ;; split-into-words, implementado con la expresión regular trivial
  ;; \w+, funciona bien en inglés:
  
  (split-into-words "Have a nice day.")
  ;; => ("Have" "a" "nice" "day")

  ;; Pero falla con otros lenguajes, como el español.  En el siguiente
  ;; ejemplo, la palabra día es dividida en «d» y en «a»:

  (split-into-words "Que tenga un buen día")
  ;; => ("Que" "tenga" "un" "buen" "d" "a")
 )

 ;; Unicode-aware word splitter: returns the successive words found in
 ;; the given text.  A word starts with an alphabetic character and
 ;; continues with alphabetic characters or combining marks (\p{M}).
 ;; Including the marks matters for decomposed (NFD) text: there "í" is
 ;; stored as "i" followed by U+0301, which is not itself alphabetic and
 ;; would otherwise cut the word in two.  Precomposed text is matched
 ;; exactly as with a plain \p{IsAlphabetic}+.
 (def split-into-words
   (partial re-seq #"\p{IsAlphabetic}[\p{IsAlphabetic}\p{M}]*"))


 (comment
  ;; En cambio ahora, split-into-words, implementado con una
  ;; expresión regular usando el soporte de Unicode, trabaja
  ;; correctamente con lenguajes como el español:
  
  (split-into-words "Que tenga un buen día")
  ;; => ("Que" "tenga" "un" "buen" "día")
 )
	;; Naive word splitter: returns the successive matches of \w+ in `text`.
	;; On the JVM, \w is ASCII-only by default ([a-zA-Z_0-9]), so any
	;; accented letter acts as a separator.
	(defn split-into-words [text]
	  (re-seq #"\w+" text))

	(comment
	;; split-into-words, implemented with the trivial \w+ regular
	;; expression, works fine in English:

	(split-into-words "Have a nice day.")
	;; => ("Have" "a" "nice" "day")

	;; But it fails with other languages, such as Spanish. In the
	;; following example, the word "día" is split into "d" and "a":

	(split-into-words "Que tenga un buen día")
	;; => ("Que" "tenga" "un" "buen" "d" "a")
	)

	;; Unicode-aware word splitter: returns the successive words found in
	;; the given text.  A word starts with an alphabetic character and
	;; continues with alphabetic characters or combining marks (\p{M}).
	;; Including the marks matters for decomposed (NFD) text: there "í" is
	;; stored as "i" followed by U+0301, which is not itself alphabetic and
	;; would otherwise cut the word in two.  Precomposed text is matched
	;; exactly as with a plain \p{IsAlphabetic}+.
	(def split-into-words
	  (partial re-seq #"\p{IsAlphabetic}[\p{IsAlphabetic}\p{M}]*"))


	(comment
	;; Now, split-into-words, implemented with the regular expression
	;; with Unicode Support, works correctly with other languages like
	;; Spanish:

	(split-into-words "Que tenga un buen día")
	;; => ("Que" "tenga" "un" "buen" "día")
	)
	;; Demostración del uso de expresiones regulares con soporte de Unicode.
	;; Divide un texto en palabras.

	;; Naive word splitter: returns the successive matches of \w+ in `text`.
	;; On the JVM, \w is ASCII-only by default ([a-zA-Z_0-9]), so any
	;; accented letter acts as a separator.
	(defn split-into-words [text]
	  (re-seq #"\w+" text))

	(comment
	;; split-into-words, implementado con la expresión regular trivial
	;; \w+, funciona bien en inglés:

	(split-into-words "Have a nice day.")
	;; => ("Have" "a" "nice" "day")

	;; Pero falla con otros lenguajes, como el español. En el siguiente
	;; ejemplo, la palabra día es dividida en «d» y en «a»:

	(split-into-words "Que tenga un buen día")
	;; => ("Que" "tenga" "un" "buen" "d" "a")
	)

	;; Unicode-aware word splitter: returns the successive words found in
	;; the given text.  A word starts with an alphabetic character and
	;; continues with alphabetic characters or combining marks (\p{M}).
	;; Including the marks matters for decomposed (NFD) text: there "í" is
	;; stored as "i" followed by U+0301, which is not itself alphabetic and
	;; would otherwise cut the word in two.  Precomposed text is matched
	;; exactly as with a plain \p{IsAlphabetic}+.
	(def split-into-words
	  (partial re-seq #"\p{IsAlphabetic}[\p{IsAlphabetic}\p{M}]*"))


	(comment
	;; En cambio ahora, split-into-words, implementado con una
	;; expresión regular usando el soporte de Unicode, trabaja
	;; correctamente con lenguajes como el español:

	(split-into-words "Que tenga un buen día")
	;; => ("Que" "tenga" "un" "buen" "día")
	)