Created
March 11, 2025 11:59
-
-
Save ShikiOkasaka/53776e7712168ad4b57ba8430b179c55 to your computer and use it in GitHub Desktop.
GiNZA v5 Transformersモデルで文を修飾部と述部にわけてみる
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import spacy

# Load the 'ja_ginza_electra' pipeline once, at module import time; segment()
# parses every sentence with this shared instance.
nlp = spacy.load("ja_ginza_electra")
def segment(sentence):
    """Print *sentence* split into modifier segments and a predicate segment.

    The text is parsed with the module-level ``nlp`` pipeline, and the
    dependency root of the sentence containing the first token is used as
    the predicate.  Direct children of the root whose relation is ``aux``,
    ``cop``, ``mark``, ``punct`` or ``compound`` are merged into the
    predicate segment; every other direct child contributes its whole
    subtree as one separate segment.  A single line of the form
    ``/seg1/seg2/.../`` is written to standard output.

    Only the sentence containing the first token is segmented; any further
    sentences in the input are ignored.

    Args:
        sentence: The text to analyse.

    Returns:
        None.  The segmentation is printed, not returned.
    """
    doc = nlp(sentence)
    if len(doc) == 0:
        # A parse without tokens has no root to look up; emit the degenerate
        # segmentation instead of failing on doc[0].
        print('/')
        return

    root_token = doc[0].sent.root
    merged_deps = ('aux', 'cop', 'mark', 'punct', 'compound')

    root_prefix = []   # merged children located before the root
    root_suffix = []   # merged children located after the root
    before_root = []   # independent segments located before the root
    after_root = []    # independent segments located after the root

    for child in root_token.children:
        subtree_tokens = [token.text for token in child.subtree]
        follows_root = root_token.i < child.i
        if child.dep_ in merged_deps:
            # Appending (rather than prepending one child at a time) keeps
            # several left-side children in the order they are iterated,
            # which is presumably document order -- see spaCy Token.children.
            (root_suffix if follows_root else root_prefix).extend(subtree_tokens)
        elif follows_root:
            after_root.append(''.join(subtree_tokens))
        else:
            before_root.append(''.join(subtree_tokens))

    # Assemble the predicate only after every child has been seen, so that it
    # is emitted exactly once and includes all merged children, even those
    # that are iterated after a right-side modifier.
    predicate = ''.join(root_prefix + [root_token.text] + root_suffix)
    segments = before_root + [predicate] + after_root
    print(''.join(f"/{text}" for text in segments) + '/')
# The example sentences are taken from "Nihongo Sakubunjutsu" (『日本語作文術』)
# by Ryozo Nouchi (野内良三), 2010, p. 54.
def main():
    """Run segment() on each built-in example sentence, in order."""
    examples = (
        "明智は、安楽イスのクッションにふかぶかと身をしずめ、辻野氏におとらぬ、にこやかな顔で答えました。",
        "辻野氏のなにげないことばには、おそろしい力がこもっていました。",
        "興奮のために、イスのひじ掛けにのせた左手の先が、かすかにふるえていました。",
        "明智は平然として、このおどろくべきことばを語りました。",
    )
    for example in examples:
        segment(example)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
実行結果