Created
September 23, 2019 06:06
-
-
Save maxymania/cd592096297954fac6339190b7f27267 to your computer and use it in GitHub Desktop.
Unicode splitter in Java.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Public Domain!
import java.util.Map; | |
import java.util.concurrent.ConcurrentHashMap; | |
import java.util.function.IntFunction; | |
import java.util.function.IntPredicate; | |
import org.jparsec.Parser; | |
import org.jparsec.Parsers; | |
import org.jparsec.pattern.CharPredicate; | |
import org.jparsec.pattern.Pattern; | |
import org.jparsec.pattern.Patterns; | |
/** | |
* | |
* @author simon | |
*/ | |
public class SplitterLib { | |
static final Pattern HS = Patterns.isChar(Character::isHighSurrogate); | |
static final Pattern LS = Patterns.isChar(Character::isLowSurrogate); | |
static final Pattern CODEPT = HS.next(LS); | |
static final Pattern GUARD = HS.next(LS).not(); | |
static<T> Parser<T> matches(boolean b){ | |
if(b) return Parsers.always(); | |
return Parsers.never(); | |
} | |
static Parser<?> many(final IntPredicate ip,final CharPredicate cp){ | |
Parser<?> xs = GUARD.next(Patterns.isChar(cp)).many1().toScanner("[x]+"); | |
Parser<?> xl = CODEPT.toScanner("[X]").source().next(s->matches(ip.test(s.codePointAt(0)))); | |
return Parsers.or(xs,xl).skipMany(); | |
} | |
static Parser<?> many(final IntPredicate ip){ | |
Parser<?> xs = GUARD.next(Patterns.isChar(ip::test)).many1().toScanner("[x]+"); | |
Parser<?> xl = CODEPT.toScanner("[X]").source().next(s->matches(ip.test(s.codePointAt(0)))); | |
return Parsers.or(xs,xl).skipMany(); | |
} | |
static Parser<?> uniBlock(final IntFunction<Parser<?>> func){ | |
Parser<?> cpt = GUARD.next(Patterns.ANY_CHAR).or(CODEPT).toScanner("CODEPT"); | |
return cpt.source().next(s->func.apply(s.codePointAt(0))); | |
} | |
static Parser<?> sameBlock(Character.UnicodeBlock block){ | |
return many(i->Character.UnicodeBlock.of(i)==block); | |
} | |
static Parser<?> sameScript(Character.UnicodeScript block){ | |
return many(i->Character.UnicodeScript.of(i)==block); | |
} | |
static final Map<Character.UnicodeBlock,Parser<?>> MAP_SAME_BLOCK = | |
new ConcurrentHashMap<>(); | |
static final Map<Character.UnicodeScript,Parser<?>> MAP_SAME_SCRIPT = | |
new ConcurrentHashMap<>(); | |
public static final Parser<?> SAME_BLOCK; | |
public static final Parser<?> SAME_SCRIPT; | |
static { | |
IntFunction<Parser<?>> p; | |
p = i->MAP_SAME_BLOCK.computeIfAbsent(Character.UnicodeBlock.of(i), SplitterLib::sameBlock); | |
SAME_BLOCK = uniBlock(p); | |
p = i->MAP_SAME_SCRIPT.computeIfAbsent(Character.UnicodeScript.of(i), SplitterLib::sameScript); | |
SAME_SCRIPT = uniBlock(p); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment