Skip to content

Instantly share code, notes, and snippets.

@ShikiOkasaka
Created March 28, 2025 05:48
Show Gist options
  • Save ShikiOkasaka/8955b08b7b6acc0f7a988c9cdfb60148 to your computer and use it in GitHub Desktop.
Save ShikiOkasaka/8955b08b7b6acc0f7a988c9cdfb60148 to your computer and use it in GitHub Desktop.
複文をチェックするツール
#!/usr/bin/env python
#
# Copyright 2025 Esrille Inc.
# SPDX-License-Identifier: Apache-2.0
import getopt
import spacy
import sys
# Unicode interlinear annotation characters; get_plain_text() uses them to
# tell the text it keeps from the annotation text it drops.
IAA = '\uFFF9' # IAA (INTERLINEAR ANNOTATION ANCHOR)
IAS = '\uFFFA' # IAS (INTERLINEAR ANNOTATION SEPARATOR)
IAT = '\uFFFB' # IAT (INTERLINEAR ANNOTATION TERMINATOR)
# spaCy pipeline, loaded once at import time and used by segment().
nlp = spacy.load('ja_ginza_bert_large')
def get_plain_text(text):
    """Return *text* with interlinear annotation markup removed.

    The IAA, IAS and IAT characters themselves are always dropped, and so
    is every character that follows an IAS up to the next IAA or IAT.
    All other characters are kept in their original order.
    """
    kept = []
    skipping = False
    for ch in text:
        if ch == IAS:
            # Annotation text starts here; drop it until it is closed.
            skipping = True
        elif ch in (IAA, IAT):
            # Either marker puts us back into text that is kept.
            skipping = False
        elif not skipping:
            kept.append(ch)
    return ''.join(kept)
def is_term(token):
    """Return True when *token*'s dependency label counts as a term.

    The labels counted are acl, advcl, csubj, dep, nsubj and obl.
    """
    return token.dep_ in ('acl', 'advcl', 'csubj', 'dep', 'nsubj', 'obl')
def is_dependent_clause(token, has_term):
    """Return True when *token* heads a dependent clause.

    Args:
        token: Object exposing ``dep_`` and ``tag_`` string attributes
            (a spaCy token in this script).
        has_term: Whether any child of *token* was accepted by
            ``is_term``; a token without such a child is never reported
            as a dependent clause.

    Returns:
        bool: always ``True`` or ``False``. The previous version fell off
        the end and returned ``None`` when no rule matched.
    """
    if not has_term:
        return False
    if token.dep_ in ('acl', 'ccomp', 'csubj', 'dep'):
        return True
    # An advcl counts as well, unless its tag is exactly '副詞' (adverb).
    return token.dep_ == 'advcl' and token.tag_ != '副詞'
def is_clause(token):
    """Return True when *token*'s dependency label marks a clause head.

    The labels counted are ROOT, acl, advcl, ccomp, csubj and dep.
    """
    return token.dep_ in ('ROOT', 'acl', 'advcl', 'ccomp', 'csubj', 'dep')
def in_order(token):
    """Render the subtree rooted at *token* as annotated text.

    Children are visited in the order ``token.children`` yields them.
    The token's own text is emitted just before the first child whose
    index is not smaller than the token's, or after all children when
    there is no such child. The result is wrapped in parentheses when
    ``is_dependent_clause`` accepts the token, and a '/' is appended when
    the token's head is a clause head and the token precedes it.
    """
    pieces = []
    has_term = False
    head_emitted = False
    for kid in token.children:
        if not head_emitted and kid.i >= token.i:
            # First child that is not to the left: the token goes here.
            pieces.append(token.text)
            head_emitted = True
        if is_term(kid):
            has_term = True
        pieces.append(in_order(kid))
    if not head_emitted:
        # Every child (if any) was on the left of the token.
        pieces.append(token.text)
    text = ''.join(pieces)

    if is_dependent_clause(token, has_term):
        text = f'({text})'
    if is_clause(token.head) and token.i < token.head.i:
        text += '/'
    return text
def segment(sentence, verbose=False):
    """Parse *sentence* and return its clause-annotated rendering.

    Args:
        sentence: Text to analyse. A falsy value (empty string, ``None``)
            is returned unchanged without running the pipeline.
        verbose: When true, print the input followed by one line per
            token (text, POS, tag, head text and dependency label,
            indented by the number of ancestors) and a blank line.

    Returns:
        The concatenation of ``in_order`` applied to the root of every
        sentence the pipeline finds in the input. The previous version
        looked only at the sentence containing the first token, so
        anything after the first sentence boundary was silently dropped,
        and a parse with no tokens raised ``IndexError``.
    """
    if not sentence:
        return sentence
    doc = nlp(sentence)
    # Materialise once: the sentences are walked for both the verbose
    # dump and the final rendering.
    sents = list(doc.sents)
    if verbose:
        print(sentence)
        for sent in sents:
            for token in sent:
                indent = ' ' * len(list(token.ancestors))
                print(f'{indent}{token.text} ({token.pos_}, {token.tag_})'
                      f'→ {token.head.text} ({token.dep_})')
        print()
    return ''.join(in_order(sent.root) for sent in sents)
def print_help(v=0):
    """Print the command-line option summary and exit with status *v*."""
    usage_lines = (
        '-h, --help show this message.',
        '-v, --verbose display verbose output.',
    )
    for usage in usage_lines:
        print(usage)
    sys.exit(v)
def main():
    """Command-line entry point.

    Parses -h/--help and -v/--verbose, then reads standard input line by
    line, strips each line, removes annotation markup with
    get_plain_text() and prints the result of segment().
    An unrecognised option prints the help text and exits with status 1.
    """
    try:
        options, _ = getopt.getopt(sys.argv[1:], 'hv', ['help', 'verbose'])
    except getopt.GetoptError:
        print_help(1)

    verbose = False
    for flag, _ in options:
        if flag in ('-h', '--help'):
            print_help(0)
        elif flag in ('-v', '--verbose'):
            verbose = True

    for raw_line in sys.stdin:
        print(segment(get_plain_text(raw_line.strip()), verbose))
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == '__main__':
    main()
@ShikiOkasaka
Copy link
Author

このツールに関する記事: 「ソフトウェアで複文をチェックする」

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment