spaCy courses (https://course.spacy.io/)
# Source: https://course.spacy.io/
# Note: this code uses the spaCy v2.x API throughout.

# =========================== Chapter 1 =========================== #
# Import the English language class
import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher

# Create the nlp object
nlp = English()

# Created by processing a string of text with the nlp object
doc = nlp("Hello world!")

# Iterate over tokens in a Doc
for token in doc:
    print(token.text)

# A slice of a Doc is a Span
span = doc[1:4]
print(span.text)

doc = nlp("It costs $5.")
print('Index:   ', [token.i for token in doc])
print('Text:    ', [token.text for token in doc])
print('is_alpha:', [token.is_alpha for token in doc])
print('is_punct:', [token.is_punct for token in doc])
print('like_num:', [token.like_num for token in doc])
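
# (Extra sketch, not part of the original gist.) like_num also covers
# spelled-out numbers, and is handy for finding percentages: check each
# number token and peek at the following "%" token.
percent_doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)
for token in percent_doc:
    if token.like_num:
        # Guard against running off the end of the doc
        if token.i + 1 < len(percent_doc) and percent_doc[token.i + 1].text == "%":
            print("Percentage found:", token.text)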
# Load a pretrained statistical model
nlp = spacy.load('en_core_web_sm')

doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)
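
# (Extra aside, my addition.) spacy.explain gives a quick, human-readable
# definition for tags and labels such as "GPE" or "ORG".
print(spacy.explain("GPE"))  # e.g. "Countries, cities, states"
print(spacy.explain("ORG"))  # e.g. "Companies, agencies, institutions, etc."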
# Predicting named entities in context
text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

# Process the text
doc = nlp(text)

# Iterate over the entities
for ent in doc.ents:
    # Print the entity text and label
    print(ent.text, ent.label_)

# Get the span for "iPhone X" (the model misses it as an entity)
iphone_x = doc[1:3]
# Print the span text
print("Missing entity:", iphone_x.text)
# ============= Section 11 ============= #
doc = nlp("New iPhone X release date leaked as Apple reveals pre-orders by mistake")

# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

# Add the pattern to the matcher (in the v2 API, the second argument is an
# optional on_match callback)
matcher.add("IPHONE_X_PATTERN", None, pattern)

# Use the matcher on the doc
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])
# ============= Section 12 - Part 1 ============= #
doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

# Use a fresh Matcher so patterns from earlier sections don't affect the count
matcher = Matcher(nlp.vocab)

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"LOWER": "ios"}, {"IS_DIGIT": True}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
# ============= Section 12 - Part 2 ============= #
doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# Again start from a fresh Matcher
matcher = Matcher(nlp.vocab)

# Write a pattern that matches a form of "download" plus proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
# ============= Section 12 - Part 3 ============= #
doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# Again start from a fresh Matcher
matcher = Matcher(nlp.vocab)

# Write a pattern for an adjective plus one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
# =========================== Chapter 2 =========================== #
import json

import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load('en_core_web_sm')

# Strings are stored once in the shared vocab and referenced by hash
doc = nlp("coffee")
coffee_hash = nlp.vocab.strings["coffee"]
coffee_string = nlp.vocab.strings[coffee_hash]
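
# (Extra sketch, my addition.) Lexemes are the context-independent vocabulary
# entries behind these hashes:
lexeme = nlp.vocab["coffee"]
print(lexeme.text, lexeme.orth, lexeme.is_alpha)  # lexeme.orth is the hash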
####### Section 2 - Part 1 #######
doc = nlp("I have a cat")

# Look up the hash for the word "cat"
cat_hash = nlp.vocab.strings["cat"]
print(cat_hash)

# Look up the cat_hash to get the string
cat_string = nlp.vocab.strings[cat_hash]
print(cat_string)

####### Section 2 - Part 2 #######
doc = nlp("David Bowie is a PERSON")

# Look up the hash for the string label "PERSON"
person_hash = nlp.vocab.strings["PERSON"]
print(person_hash)

# Look up the person_hash to get the string back
person_string = nlp.vocab.strings[person_hash]
print(person_string)
######### Section 7 #########
doc = nlp("Berlin is a nice city")

# Iterate over the tokens
for token in doc:
    # Check if the current token is a proper noun
    if token.pos_ == "PROPN":
        # Check if the next token is a verb (guard against the end of the doc)
        if token.i + 1 < len(doc) and doc[token.i + 1].pos_ == "VERB":
            print("Found proper noun before a verb:", token.text)
######### Section 8 #########
# # Load a larger model with vectors
# print("Loading medium model ...")
# nlp = spacy.load('en_core_web_md')
#
# # Compare two documents
# doc1 = nlp("I like fast food")
# doc2 = nlp("I like pizza")
# print(doc1.similarity(doc2))
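# # (Extra sketch, my addition, commented out for the same reason: it needs the
# # medium model's vectors.) Tokens and spans expose the same similarity API:
# doc = nlp("I like pizza and pasta")
# print(doc[2].similarity(doc[4]))  # similarity of "pizza" and "pasta"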
######### Section 13 #########
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# Create the match patterns: "Amazon" + titlecased proper noun, and
# "ad-free" + noun (the hyphen is tokenized as its own token)
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
pattern2 = [{"LOWER": "ad"}, {"LOWER": "-"}, {"LOWER": "free"}, {"POS": "NOUN"}]

# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", None, pattern1)
matcher.add("PATTERN2", None, pattern2)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)
######### Section 14 #########
with open("exercises/countries.json") as f:
    COUNTRIES = json.loads(f.read())

nlp = English()
doc = nlp("Czech Republic may help Slovakia protect its airspace")

matcher = PhraseMatcher(nlp.vocab)

# Create pattern Doc objects and add them to the matcher
# This is the faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Call the matcher on the test document and print the result
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])
########### Section 15 ###########
with open("exercises/countries.json") as f:
    COUNTRIES = json.loads(f.read())
with open("exercises/country_text.txt") as f:
    TEXT = f.read()

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Create a doc and find matches in it
doc = nlp(TEXT)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Create a Span with the label for "GPE"
    span = Span(doc, start, end, label="GPE")

    # Overwrite the doc.ents and add the span
    doc.ents = list(doc.ents) + [span]

    # Get the span's root head token
    span_root_head = span.root.head
    # Print the text of the span root's head token and the span text
    print(span_root_head.text, "-->", span.text)

# Print the entities in the document
print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "GPE"])
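
# (Extra aside, my addition.) displaCy can visualize these entity spans in a
# browser; commented out since it starts a web server:
# from spacy import displacy
# displacy.serve(doc, style="ent")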
# =========================== Chapter 3 =========================== #
import json

import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span, Token

######### Section 6 #########
# Define the custom component
def length_component(doc):
    # Get the doc's length
    doc_length = len(doc)
    print("This document is {} tokens long.".format(doc_length))
    # Return the doc
    return doc

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Add the component first in the pipeline and print the pipe names
# (v2 API: components are added as plain functions)
nlp.add_pipe(length_component, first=True)
print(nlp.pipe_names)

# Process a text
doc = nlp("Hello there")
######### Section 7 #########
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

# Define the custom component
def animal_component(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label 'ANIMAL'
    spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc

# Add the component to the pipeline after the 'ner' component
nlp.add_pipe(animal_component, after='ner')
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])
########### Section 9 ###########
def get_reversed(token):
    return token.text[::-1]

# Register the Token extension attribute 'is_country' with the default value
# False, and 'reversed' with a getter function
Token.set_extension('is_country', default=False)
Token.set_extension('reversed', getter=get_reversed)

# Process the text and set the is_country attribute to True for the token "Spain"
doc = nlp("I live in Spain.")
doc[3]._.is_country = True

# Print the token text and the is_country attribute for all tokens
print([(token, token._.is_country) for token in doc])
print([(token, token._.reversed) for token in doc])
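
# (Extra sketch, my addition, using the same extension API.) Getters also work
# on the Doc, e.g. checking whether any token looks like a number:
from spacy.tokens import Doc
Doc.set_extension("has_number", getter=lambda d: any(t.like_num for t in d))
print(nlp("The museum closed for five years in 2012.")._.has_number)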
########### Section 14 ###########
with open("exercises/tweets.json") as f:
    TEXTS = json.loads(f.read())

# Process the texts one by one and print the adjectives
for text in TEXTS:
    doc = nlp(text)
    print([token.text for token in doc if token.pos_ == "ADJ"])

# Batch processing with nlp.pipe is faster for many texts
docs = list(nlp.pipe(TEXTS))
for doc in docs:
    print([token.text for token in doc if token.pos_ == "ADJ"])
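
# (Extra sketch, my addition; the data is illustrative.) nlp.pipe can also
# carry (text, context) tuples through processing, keeping metadata next to
# each doc:
data = [("This is a text", {"id": 1}), ("And another one", {"id": 2})]
for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context["id"])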
# =========================== Chapter 4 =========================== #
import json
import random

import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English

# ###### Create training data ########
# with open("exercises/iphone.json") as f:
#     TEXTS = json.loads(f.read())
#
# nlp = English()
# matcher = Matcher(nlp.vocab)
# pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
# pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]
# patterns = [pattern1, pattern2]
# matcher.add("GADGET", None, *patterns)
#
# TRAINING_DATA = []
#
# # Create a Doc object for each text in TEXTS
# for doc in nlp.pipe(TEXTS):
#     # Match on the doc and create a list of matched spans
#     spans = [doc[start:end] for match_id, start, end in matcher(doc)]
#
#     # Get (start character, end character, label) tuples of matches
#     entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
#
#     # Format the matches as a (doc.text, entities) tuple
#     training_example = (doc.text, {"entities": entities})
#
#     # Append the example to the training data
#     TRAINING_DATA.append(training_example)
#
# print(*TRAINING_DATA, sep="\n")
########## Section 6 - Setting up the pipeline ##########
TRAINING_DATA = [
    ["How to preorder the iPhone X", {"entities": [[20, 28, "GADGET"]]}],
    ["iPhone X is coming", {"entities": [[0, 8, "GADGET"]]}],
    ["Should I pay $1,000 for the iPhone X?", {"entities": [[28, 36, "GADGET"]]}],
    ["The iPhone 8 reviews are here", {"entities": [[4, 12, "GADGET"]]}],
    ["Your iPhone goes up to 11 today", {"entities": [[5, 11, "GADGET"]]}],
    ["I need a new phone! Any tips?", {"entities": []}],
]

# Create a blank English pipeline
nlp = spacy.blank("en")
# Create a new entity recognizer and add it to the pipeline
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
# Add the label 'GADGET' to the entity recognizer
ner.add_label("GADGET")

# Start the training
nlp.begin_training()

# Loop for 10 iterations
for itn in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}

    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]

        # Update the model (v2 API: raw texts plus annotation dicts)
        nlp.update(texts, annotations, losses=losses)
    print(losses)
test_texts = [
    "Apple is slowing down the iPhone 8 and iPhone X - how to stop it",
    "I finally understand what the iPhone X ‘notch’ is for",
    "Everything you need to know about the Samsung Galaxy S9",
    "Looking to compare iPad models? Here’s how the 2018 lineup stacks up",
    "The iPhone 8 and iPhone 8 Plus are smartphones designed, developed, and marketed by Apple",
    "what is the cheapest ipad, especially ipad pro???",
    "Samsung Galaxy is a series of mobile computing devices designed, manufactured and marketed by Samsung Electronics",
]

# Run the trained model over the test texts and print the predicted entities
for doc in nlp.pipe(test_texts):
    print(doc.text, [(ent.label_, ent.text, ent.start_char, ent.end_char) for ent in doc.ents])
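
# (Extra follow-up, my addition; the path is illustrative.) A trained pipeline
# can be serialized with nlp.to_disk and reloaded later with spacy.load:
# nlp.to_disk("/tmp/gadget_model")
# nlp = spacy.load("/tmp/gadget_model")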