spaCy courses (https://course.spacy.io/)
# Source: https://course.spacy.io/
# Note: this code uses the spaCy v2.x API throughout.

# =========================== Chapter 1 =========================== #
# Import the English language class
import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher

# Create the nlp object
nlp = English()

# Created by processing a string of text with the nlp object
doc = nlp("Hello world!")

# Iterate over tokens in a Doc
for token in doc:
    print(token.text)

# A slice of a Doc is a Span
span = doc[1:4]
print(span.text)

doc = nlp("It costs $5.")
print('Index:   ', [token.i for token in doc])
print('Text:    ', [token.text for token in doc])
print('is_alpha:', [token.is_alpha for token in doc])
print('is_punct:', [token.is_punct for token in doc])
print('like_num:', [token.like_num for token in doc])
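
# (Extra sketch, not part of the original gist.) like_num also covers
# spelled-out numbers, and is handy for finding percentages: check each
# number token and peek at the following "%" token.
percent_doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)
for token in percent_doc:
    if token.like_num:
        # Guard against running off the end of the doc
        if token.i + 1 < len(percent_doc) and percent_doc[token.i + 1].text == "%":
            print("Percentage found:", token.text)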
# Load a pretrained statistical model
nlp = spacy.load('en_core_web_sm')

doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)
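
# (Extra aside, my addition.) spacy.explain gives a quick, human-readable
# definition for tags and labels such as "GPE" or "ORG".
print(spacy.explain("GPE"))  # e.g. "Countries, cities, states"
print(spacy.explain("ORG"))  # e.g. "Companies, agencies, institutions, etc."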
# Predicting named entities in context
text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

# Process the text
doc = nlp(text)

# Iterate over the entities
for ent in doc.ents:
    # Print the entity text and label
    print(ent.text, ent.label_)

# Get the span for "iPhone X" (the model misses it as an entity)
iphone_x = doc[1:3]
# Print the span text
print("Missing entity:", iphone_x.text)
# ============= Section 11 ============= #
doc = nlp("New iPhone X release date leaked as Apple reveals pre-orders by mistake")

# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

# Add the pattern to the matcher (in the v2 API, the second argument is an
# optional on_match callback)
matcher.add("IPHONE_X_PATTERN", None, pattern)

# Use the matcher on the doc
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])
# ============= Section 12 - Part 1 ============= #
doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

# Use a fresh Matcher so patterns from earlier sections don't affect the count
matcher = Matcher(nlp.vocab)

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"LOWER": "ios"}, {"IS_DIGIT": True}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
# ============= Section 12 - Part 2 ============= #
doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# Again start from a fresh Matcher
matcher = Matcher(nlp.vocab)

# Write a pattern that matches a form of "download" plus proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
# ============= Section 12 - Part 3 ============= #
doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# Again start from a fresh Matcher
matcher = Matcher(nlp.vocab)

# Write a pattern for an adjective plus one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
# =========================== Chapter 2 =========================== #
import json

import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load('en_core_web_sm')

# Strings are stored once in the shared vocab and referenced by hash
doc = nlp("coffee")
coffee_hash = nlp.vocab.strings["coffee"]
coffee_string = nlp.vocab.strings[coffee_hash]
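
# (Extra sketch, my addition.) Lexemes are the context-independent vocabulary
# entries behind these hashes:
lexeme = nlp.vocab["coffee"]
print(lexeme.text, lexeme.orth, lexeme.is_alpha)  # lexeme.orth is the hash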
####### Section 2 - Part 1 #######
doc = nlp("I have a cat")

# Look up the hash for the word "cat"
cat_hash = nlp.vocab.strings["cat"]
print(cat_hash)

# Look up the cat_hash to get the string
cat_string = nlp.vocab.strings[cat_hash]
print(cat_string)

####### Section 2 - Part 2 #######
doc = nlp("David Bowie is a PERSON")

# Look up the hash for the string label "PERSON"
person_hash = nlp.vocab.strings["PERSON"]
print(person_hash)

# Look up the person_hash to get the string back
person_string = nlp.vocab.strings[person_hash]
print(person_string)
######### Section 7 #########
doc = nlp("Berlin is a nice city")

# Iterate over the tokens
for token in doc:
    # Check if the current token is a proper noun
    if token.pos_ == "PROPN":
        # Check if the next token is a verb (guard against the end of the doc)
        if token.i + 1 < len(doc) and doc[token.i + 1].pos_ == "VERB":
            print("Found proper noun before a verb:", token.text)
######### Section 8 #########
# # Load a larger model with vectors
# print("Loading medium model ...")
# nlp = spacy.load('en_core_web_md')
#
# # Compare two documents
# doc1 = nlp("I like fast food")
# doc2 = nlp("I like pizza")
# print(doc1.similarity(doc2))
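# # (Extra sketch, my addition, commented out for the same reason: it needs the
# # medium model's vectors.) Tokens and spans expose the same similarity API:
# doc = nlp("I like pizza and pasta")
# print(doc[2].similarity(doc[4]))  # similarity of "pizza" and "pasta"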
######### Section 13 #########
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# Create the match patterns: "Amazon" + titlecased proper noun, and
# "ad-free" + noun (the hyphen is tokenized as its own token)
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
pattern2 = [{"LOWER": "ad"}, {"LOWER": "-"}, {"LOWER": "free"}, {"POS": "NOUN"}]

# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", None, pattern1)
matcher.add("PATTERN2", None, pattern2)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)
######### Section 14 #########
with open("exercises/countries.json") as f:
    COUNTRIES = json.loads(f.read())

nlp = English()
doc = nlp("Czech Republic may help Slovakia protect its airspace")

matcher = PhraseMatcher(nlp.vocab)

# Create pattern Doc objects and add them to the matcher
# This is the faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Call the matcher on the test document and print the result
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])
########### Section 15 ###########
with open("exercises/countries.json") as f:
    COUNTRIES = json.loads(f.read())
with open("exercises/country_text.txt") as f:
    TEXT = f.read()

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Create a doc and find matches in it
doc = nlp(TEXT)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Create a Span with the label for "GPE"
    span = Span(doc, start, end, label="GPE")

    # Overwrite the doc.ents and add the span
    doc.ents = list(doc.ents) + [span]

    # Get the span's root head token
    span_root_head = span.root.head
    # Print the text of the span root's head token and the span text
    print(span_root_head.text, "-->", span.text)

# Print the entities in the document
print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "GPE"])
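
# (Extra aside, my addition.) displaCy can visualize these entity spans in a
# browser; commented out since it starts a web server:
# from spacy import displacy
# displacy.serve(doc, style="ent")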
# =========================== Chapter 3 =========================== #
import json

import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span, Token

######### Section 6 #########
# Define the custom component
def length_component(doc):
    # Get the doc's length
    doc_length = len(doc)
    print("This document is {} tokens long.".format(doc_length))
    # Return the doc
    return doc

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Add the component first in the pipeline and print the pipe names
# (v2 API: components are added as plain functions)
nlp.add_pipe(length_component, first=True)
print(nlp.pipe_names)

# Process a text
doc = nlp("Hello there")
######### Section 7 #########
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

# Define the custom component
def animal_component(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label 'ANIMAL'
    spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc

# Add the component to the pipeline after the 'ner' component
nlp.add_pipe(animal_component, after='ner')
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])
########### Section 9 ###########
def get_reversed(token):
    return token.text[::-1]

# Register the Token extension attribute 'is_country' with the default value
# False, and 'reversed' with a getter function
Token.set_extension('is_country', default=False)
Token.set_extension('reversed', getter=get_reversed)

# Process the text and set the is_country attribute to True for the token "Spain"
doc = nlp("I live in Spain.")
doc[3]._.is_country = True

# Print the token text and the is_country attribute for all tokens
print([(token, token._.is_country) for token in doc])
print([(token, token._.reversed) for token in doc])
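
# (Extra sketch, my addition, using the same extension API.) Getters also work
# on the Doc, e.g. checking whether any token looks like a number:
from spacy.tokens import Doc
Doc.set_extension("has_number", getter=lambda d: any(t.like_num for t in d))
print(nlp("The museum closed for five years in 2012.")._.has_number)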
########### Section 14 ###########
with open("exercises/tweets.json") as f:
    TEXTS = json.loads(f.read())

# Process the texts one by one and print the adjectives
for text in TEXTS:
    doc = nlp(text)
    print([token.text for token in doc if token.pos_ == "ADJ"])

# Batch processing with nlp.pipe is faster for many texts
docs = list(nlp.pipe(TEXTS))
for doc in docs:
    print([token.text for token in doc if token.pos_ == "ADJ"])
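
# (Extra sketch, my addition; the data is illustrative.) nlp.pipe can also
# carry (text, context) tuples through processing, keeping metadata next to
# each doc:
data = [("This is a text", {"id": 1}), ("And another one", {"id": 2})]
for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context["id"])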
# =========================== Chapter 4 =========================== #
import json
import random

import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English

# ###### Create training data ########
# with open("exercises/iphone.json") as f:
#     TEXTS = json.loads(f.read())
#
# nlp = English()
# matcher = Matcher(nlp.vocab)
# pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
# pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]
# patterns = [pattern1, pattern2]
# matcher.add("GADGET", None, *patterns)
#
# TRAINING_DATA = []
#
# # Create a Doc object for each text in TEXTS
# for doc in nlp.pipe(TEXTS):
#     # Match on the doc and create a list of matched spans
#     spans = [doc[start:end] for match_id, start, end in matcher(doc)]
#
#     # Get (start character, end character, label) tuples of matches
#     entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
#
#     # Format the matches as a (doc.text, entities) tuple
#     training_example = (doc.text, {"entities": entities})
#
#     # Append the example to the training data
#     TRAINING_DATA.append(training_example)
#
# print(*TRAINING_DATA, sep="\n")
########## Section 6 - Setting up the pipeline ##########
TRAINING_DATA = [
    ["How to preorder the iPhone X", {"entities": [[20, 28, "GADGET"]]}],
    ["iPhone X is coming", {"entities": [[0, 8, "GADGET"]]}],
    ["Should I pay $1,000 for the iPhone X?", {"entities": [[28, 36, "GADGET"]]}],
    ["The iPhone 8 reviews are here", {"entities": [[4, 12, "GADGET"]]}],
    ["Your iPhone goes up to 11 today", {"entities": [[5, 11, "GADGET"]]}],
    ["I need a new phone! Any tips?", {"entities": []}],
]

# Create a blank English pipeline
nlp = spacy.blank("en")
# Create a new entity recognizer and add it to the pipeline
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
# Add the label 'GADGET' to the entity recognizer
ner.add_label("GADGET")

# Start the training
nlp.begin_training()

# Loop for 10 iterations
for itn in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}

    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]

        # Update the model (v2 API: raw texts plus annotation dicts)
        nlp.update(texts, annotations, losses=losses)
    print(losses)
test_texts = [
    "Apple is slowing down the iPhone 8 and iPhone X - how to stop it",
    "I finally understand what the iPhone X ‘notch’ is for",
    "Everything you need to know about the Samsung Galaxy S9",
    "Looking to compare iPad models? Here’s how the 2018 lineup stacks up",
    "The iPhone 8 and iPhone 8 Plus are smartphones designed, developed, and marketed by Apple",
    "what is the cheapest ipad, especially ipad pro???",
    "Samsung Galaxy is a series of mobile computing devices designed, manufactured and marketed by Samsung Electronics",
]

# Run the trained model over the test texts and print the predicted entities
for doc in nlp.pipe(test_texts):
    print(doc.text, [(ent.label_, ent.text, ent.start_char, ent.end_char) for ent in doc.ents])
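
# (Extra follow-up, my addition; the path is illustrative.) A trained pipeline
# can be serialized with nlp.to_disk and reloaded later with spacy.load:
# nlp.to_disk("/tmp/gadget_model")
# nlp = spacy.load("/tmp/gadget_model")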