ShikiOkasaka · March 28, 2025 05:48 · ShikiOkasaka · Mar 28, 2025
diff --git a/segment.py b/segment.py
 #!/usr/bin/env python
 #
 # Copyright 2025 Esrille Inc.
 # SPDX-License-Identifier: Apache-2.0

 import getopt
 import spacy
 import sys

 # Unicode interlinear annotation characters.  get_plain_text() uses them to
 # tell base text apart from the annotation text that it discards.
 IAA = '\uFFF9'  # INTERLINEAR ANNOTATION ANCHOR: base text follows
 IAS = '\uFFFA'  # INTERLINEAR ANNOTATION SEPARATOR: annotation text follows
 IAT = '\uFFFB'  # INTERLINEAR ANNOTATION TERMINATOR: annotation text ends


 # Pipeline object created once, when the module is loaded; segment() calls
 # it for every non-empty sentence.
 nlp = spacy.load('ja_ginza_bert_large')


 def get_plain_text(text):
    """Return text with interlinear annotation markup removed.

    The IAA, IAS and IAT characters themselves are dropped, and so is
    every character between an IAS and the next IAA or IAT.  All other
    characters are kept in their original order.
    """
    kept = []
    skipping = False  # True while inside the annotation part
    for ch in text:
        if ch == IAS:
            skipping = True
        elif ch in (IAA, IAT):
            skipping = False
        elif not skipping:
            kept.append(ch)
    return ''.join(kept)


 def is_term(token):
    """Return True if token.dep_ is one of the labels counted as a term.

    The labels are acl, advcl, csubj, dep, nsubj and obl.
    """
    term_labels = ('acl', 'advcl', 'csubj', 'dep', 'nsubj', 'obl')
    return token.dep_ in term_labels


 def is_dependent_clause(token, has_term):
    """Return True if token should be treated as a dependent clause.

    in_order() wraps the text of such a token's subtree in parentheses.

    Args:
        token: object exposing ``dep_`` and ``tag_`` string attributes.
        has_term: whether any direct child of token satisfied is_term().

    Returns:
        bool: False whenever has_term is false.  Otherwise True for the
        labels acl, ccomp, csubj and dep, and for advcl unless the tag is
        exactly '副詞'.
    """
    if not has_term:
        return False
    if token.dep_ in ('acl', 'ccomp', 'csubj', 'dep'):
        return True
    # Always hand back a real bool; falling off the end used to yield None.
    return token.dep_ == 'advcl' and token.tag_ != '副詞'


 def is_clause(token):
    """Return True if token.dep_ is one of the clause labels.

    The labels are ROOT, acl, advcl, ccomp, csubj and dep.
    """
    clause_labels = ('ROOT', 'acl', 'advcl', 'ccomp', 'csubj', 'dep')
    return token.dep_ in clause_labels


 def in_order(token):
    """Render the subtree rooted at token as a marked-up string.

    Children are visited in the order token.children yields them.  The
    token's own text is placed before the first child whose index is not
    smaller than the token's, or after all children if there is none.
    The result is wrapped in parentheses when is_dependent_clause()
    accepts the token, and a '/' is appended when the token precedes a
    head for which is_clause() is true.
    """
    pieces = []
    has_term = False
    own_text_placed = False

    for child in list(token.children):
        # The token's own text goes ahead of the first child that does not
        # come before it.
        if not own_text_placed and child.i >= token.i:
            pieces.append(token.text)
            own_text_placed = True
        if is_term(child):
            has_term = True
        pieces.append(in_order(child))
    if not own_text_placed:
        pieces.append(token.text)

    text = ''.join(pieces)
    if is_dependent_clause(token, has_term):
        text = f'({text})'
    if is_clause(token.head) and token.i < token.head.i:
        text += '/'
    return text


 def segment(sentence, verbose=False):
    """Parse sentence with the module-level pipeline and return it marked up.

    Every sentence the pipeline finds in the input is rendered with
    in_order() and the results are concatenated, so text after the first
    sentence boundary is no longer dropped.  Input that parses as a single
    sentence gives the same result as before.

    Args:
        sentence: text to analyse.  A falsy value is returned unchanged
            without running the pipeline.
        verbose: when true, print the input and one line per token (text,
            pos_, tag_, head text and dep_, indented by the number of
            ancestors) followed by a blank line.

    Returns:
        The marked-up text; an empty string if the pipeline yields no
        sentences.
    """
    if not sentence:
        return sentence
    doc = nlp(sentence)
    if verbose:
        print(sentence)
        # Walk the whole doc so tokens of later sentences are shown too.
        for token in doc:
            indent = '  ' * len(list(token.ancestors))
            print(f'{indent}{token.text} ({token.pos_}, {token.tag_})'
                  f'→ {token.head.text} ({token.dep_})')
        print()
    # doc.sents is empty for a zero-token doc, which avoids the IndexError
    # that doc[0] raised in that case.
    return ''.join(in_order(sent.root) for sent in doc.sents)


 def print_help(v=0):
    """Print the option summary to stdout, then exit with status v."""
    option_lines = (
        '-h, --help             show this message.',
        '-v, --verbose          display verbose output.',
    )
    for option_line in option_lines:
        print(option_line)
    sys.exit(v)


 def main():
    """Command-line entry point: segment every line read from stdin.

    Recognised options are -h/--help (print the option summary and exit
    with status 0) and -v/--verbose (pass verbose=True to segment()).
    An unrecognised option is reported on stderr, then the option summary
    is printed and the process exits with status 1.  Each input line is
    stripped, passed through get_plain_text() and segment(), and the
    result is printed.
    """
    verbose = False
    try:
        opts, _ = getopt.getopt(sys.argv[1:], 'hv', ['help', 'verbose'])
    except getopt.GetoptError as err:
        # Tell the user which option was rejected before showing the help.
        print(err, file=sys.stderr)
        print_help(1)  # does not return: print_help() calls sys.exit()
    for opt, _ in opts:
        if opt in ('-h', '--help'):
            print_help(0)
        elif opt in ('-v', '--verbose'):
            verbose = True

    for line in sys.stdin:
        text = get_plain_text(line.strip())
        print(segment(text, verbose))


 # Run the command-line interface only when this file is executed as a script.
 if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	#
	# Copyright 2025 Esrille Inc.
	# SPDX-License-Identifier: Apache-2.0

	import getopt
	import spacy
	import sys

	# Unicode interlinear annotation characters.  get_plain_text() uses them to
	# tell base text apart from the annotation text that it discards.
	IAA = '\uFFF9' # INTERLINEAR ANNOTATION ANCHOR: base text follows
	IAS = '\uFFFA' # INTERLINEAR ANNOTATION SEPARATOR: annotation text follows
	IAT = '\uFFFB' # INTERLINEAR ANNOTATION TERMINATOR: annotation text ends


	# Pipeline object created once, when the module is loaded; segment() calls
	# it for every non-empty sentence.
	nlp = spacy.load('ja_ginza_bert_large')


	def get_plain_text(text):
	    """Return text with interlinear annotation markup removed.

	    The IAA, IAS and IAT characters themselves are dropped, and so is
	    every character between an IAS and the next IAA or IAT.  All other
	    characters are kept in their original order.
	    """
	    kept = []
	    skipping = False  # True while inside the annotation part
	    for ch in text:
	        if ch == IAS:
	            skipping = True
	        elif ch in (IAA, IAT):
	            skipping = False
	        elif not skipping:
	            kept.append(ch)
	    return ''.join(kept)


	def is_term(token):
	    """Return True if token.dep_ is one of the labels counted as a term.

	    The labels are acl, advcl, csubj, dep, nsubj and obl.
	    """
	    term_labels = ('acl', 'advcl', 'csubj', 'dep', 'nsubj', 'obl')
	    return token.dep_ in term_labels


	def is_dependent_clause(token, has_term):
	    """Return True if token should be treated as a dependent clause.

	    in_order() wraps the text of such a token's subtree in parentheses.

	    Args:
	        token: object exposing ``dep_`` and ``tag_`` string attributes.
	        has_term: whether any direct child of token satisfied is_term().

	    Returns:
	        bool: False whenever has_term is false.  Otherwise True for the
	        labels acl, ccomp, csubj and dep, and for advcl unless the tag
	        is exactly '副詞'.
	    """
	    if not has_term:
	        return False
	    if token.dep_ in ('acl', 'ccomp', 'csubj', 'dep'):
	        return True
	    # Always hand back a real bool instead of falling through to None.
	    return token.dep_ == 'advcl' and token.tag_ != '副詞'


	def is_clause(token):
	    """Return True if token.dep_ is one of the clause labels.

	    The labels are ROOT, acl, advcl, ccomp, csubj and dep.
	    """
	    clause_labels = ('ROOT', 'acl', 'advcl', 'ccomp', 'csubj', 'dep')
	    return token.dep_ in clause_labels


	def in_order(token):
	    """Render the subtree rooted at token as a marked-up string.

	    Children are visited in the order token.children yields them.  The
	    token's own text is placed before the first child whose index is not
	    smaller than the token's, or after all children if there is none.
	    The result is wrapped in parentheses when is_dependent_clause()
	    accepts the token, and a '/' is appended when the token precedes a
	    head for which is_clause() is true.
	    """
	    pieces = []
	    has_term = False
	    own_text_placed = False

	    for child in list(token.children):
	        # The token's own text goes ahead of the first child that does
	        # not come before it.
	        if not own_text_placed and child.i >= token.i:
	            pieces.append(token.text)
	            own_text_placed = True
	        if is_term(child):
	            has_term = True
	        pieces.append(in_order(child))
	    if not own_text_placed:
	        pieces.append(token.text)

	    text = ''.join(pieces)
	    if is_dependent_clause(token, has_term):
	        text = f'({text})'
	    if is_clause(token.head) and token.i < token.head.i:
	        text += '/'
	    return text


	def segment(sentence, verbose=False):
	    """Parse sentence with the module-level pipeline and return it marked up.

	    Every sentence the pipeline finds in the input is rendered with
	    in_order() and the results are concatenated, so text after the first
	    sentence boundary is not dropped.

	    Args:
	        sentence: text to analyse.  A falsy value is returned unchanged
	            without running the pipeline.
	        verbose: when true, print the input and one line per token (text,
	            pos_, tag_, head text and dep_, indented by the number of
	            ancestors) followed by a blank line.

	    Returns:
	        The marked-up text; an empty string if the pipeline yields no
	        sentences.
	    """
	    if not sentence:
	        return sentence
	    doc = nlp(sentence)
	    if verbose:
	        print(sentence)
	        # Walk the whole doc so tokens of later sentences are shown too.
	        for token in doc:
	            # Two spaces per ancestor level.
	            indent = '  ' * len(list(token.ancestors))
	            print(f'{indent}{token.text} ({token.pos_}, {token.tag_})'
	                  f'→ {token.head.text} ({token.dep_})')
	        print()
	    # doc.sents is empty for a zero-token doc, so no IndexError from doc[0].
	    return ''.join(in_order(sent.root) for sent in doc.sents)


	def print_help(v=0):
	    """Print the option summary to stdout, then exit with status v."""
	    # Descriptions are padded so that both start in the same column.
	    option_lines = (
	        '-h, --help             show this message.',
	        '-v, --verbose          display verbose output.',
	    )
	    for option_line in option_lines:
	        print(option_line)
	    sys.exit(v)


	def main():
	    """Command-line entry point: segment every line read from stdin.

	    Recognised options are -h/--help (print the option summary and exit
	    with status 0) and -v/--verbose (pass verbose=True to segment()).
	    An unrecognised option is reported on stderr, then the option summary
	    is printed and the process exits with status 1.  Each input line is
	    stripped, passed through get_plain_text() and segment(), and the
	    result is printed.
	    """
	    verbose = False
	    try:
	        opts, _ = getopt.getopt(sys.argv[1:], 'hv', ['help', 'verbose'])
	    except getopt.GetoptError as err:
	        # Tell the user which option was rejected before showing the help.
	        print(err, file=sys.stderr)
	        print_help(1)  # does not return: print_help() calls sys.exit()
	    for opt, _ in opts:
	        if opt in ('-h', '--help'):
	            print_help(0)
	        elif opt in ('-v', '--verbose'):
	            verbose = True

	    for line in sys.stdin:
	        text = get_plain_text(line.strip())
	        print(segment(text, verbose))


	# Run the command-line interface only when this file is executed as a script.
	if __name__ == '__main__':
	    main()