ai-content-maker/.venv/Lib/site-packages/tests/test_golden_rules.py

297 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Test sentence segmentation against the "Golden Rules":
https://github.com/diasks2/pragmatic_segmenter#the-golden-rules
"""
import unittest
from gruut import sentences
class GoldenRulesTestCase(unittest.TestCase):
    """Sentence segmentation checks, one test per numbered "Golden Rule".

    Every active test feeds a short text through ``_get_sentences`` and
    compares the resulting list of sentence strings with the expected split.
    Rules that appear only as comments have no active test in this class;
    their example input and expected output are kept for reference.
    """

    def test_rule_1(self):
        """A simple period ends a sentence"""
        text = "Hello World. My name is Jonas."
        expected = ["Hello World.", "My name is Jonas."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_2(self):
        """A question mark ends a sentence"""
        text = "What is your name? My name is Jonas."
        expected = ["What is your name?", "My name is Jonas."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_3(self):
        """An exclamation point ends a sentence"""
        text = "There it is! I found it."
        expected = ["There it is!", "I found it."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_4(self):
        """One-letter upper case abbreviation does not end the sentence"""
        text = "My name is Jonas E. Smith."
        # NOTE: gruut removes the "." from E
        expected = ["My name is Jonas E Smith."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_5(self):
        """One-letter lower case abbreviation does not end the sentence"""
        text = "Please turn to p. 55."
        # NOTE: gruut removes the "." from p
        expected = ["Please turn to p 55."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_6(self):
        """Two-letter lower case abbreviation in the middle of a sentence"""
        text = "Were Jane and co. at the party?"
        # NOTE: gruut expands abbreviations
        expected = ["Were Jane and company at the party?"]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_7(self):
        """Two-letter upper case abbreviation in the middle of a sentence"""
        text = "They closed the deal with Pitt, Briggs & Co. at noon."
        # NOTE: gruut expands abbreviations
        expected = ["They closed the deal with Pitt, Briggs and Company at noon."]
        self.assertEqual(_get_sentences(text), expected)

    # def test_rule_8(self):
    #     """Two letter lower case abbreviations at the end of a sentence"""
    #     self.assertEqual(
    #         _get_sentences("Let's ask Jane and co. They should know."),
    #         ["Let's ask Jane and company. They should know."],
    #     )

    # def test_rule_9(self):
    #     """Two letter upper case abbreviations at the end of a sentence"""
    #     self.assertEqual(
    #         _get_sentences("They closed the deal with Pitt, Briggs & Co. It closed yesterday."),
    #         ["They closed the deal with Pitt, Briggs and Company. It closed yesterday."],
    #     )

    def test_rule_10(self):
        """Two-letter abbreviation placed before the word it modifies"""
        text = "I can see Mt. Fuji from here."
        # NOTE: gruut expands abbreviations
        expected = ["I can see Mount Fuji from here."]
        self.assertEqual(_get_sentences(text), expected)

    # def test_rule_11(self):
    #     """Two letter (prepositive & postpositive) abbreviations"""
    #     self.assertEqual(
    #         _get_sentences("St. Michael's Church is on 5th st. near the light."),
    #         ["Saint Michael's Church is on 5th street near the light."],
    #     )

    def test_rule_12(self):
        """Possessive two-letter abbreviation"""
        text = "That is JFK Jr.'s book."
        # NOTE: gruut expands abbreviations
        expected = ["That is J F K Junior's book."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_13(self):
        """Multi-period abbreviation in the middle of a sentence"""
        text = "I visited the U.S.A. last year."
        # NOTE: gruut expands abbreviations (the expected text has the
        # letters separated and the periods dropped)
        expected = ["I visited the U S A last year."]
        self.assertEqual(_get_sentences(text), expected)

    # 14) Multi-period abbreviations at the end of a sentence
    # I live in the E.U. How about you?
    # ["I live in the E.U.", "How about you?"]

    # 15) U.S. as sentence boundary
    # I live in the U.S. How about you?
    # ["I live in the U.S.", "How about you?"]

    # 16) U.S. as non sentence boundary with next word capitalized
    # I work for the U.S. Government in Virginia.
    # ["I work for the U.S. Government in Virginia."]

    def test_rule_17(self):
        """U.S. followed by a lower case word is not a sentence boundary"""
        text = "I have lived in the U.S. for 20 years."
        # NOTE: gruut expands abbreviations (the expected text has the
        # letters separated and the periods dropped)
        expected = ["I have lived in the U S for 20 years."]
        self.assertEqual(_get_sentences(text), expected)

    # 18) A.M. / P.M. as non sentence boundary and sentence boundary
    # At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.
    # ["At 5 a.m. Mr. Smith went to the bank.", "He left the bank at 6 P.M.", "Mr. Smith then went to the store."]

    def test_rule_19(self):
        """A period inside a number is not a sentence boundary"""
        text = "She has $100.00 in her bag."
        expected = ["She has $100.00 in her bag."]
        self.assertEqual(_get_sentences(text), expected)

    # 21) Parenthetical inside sentence
    # He teaches science (He previously worked for 5 years as an engineer.) at the local University.
    # ["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]

    # 22) Email addresses
    # Her email is Jane.Doe@example.com. I sent her an email.
    # ["Her email is Jane.Doe@example.com.", "I sent her an email."]

    # 23) Web addresses
    # The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.
    # ["The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out."]

    # 24) Single quotations inside sentence
    # She turned to him, 'This is great.' she said.
    # ["She turned to him, 'This is great.' she said."]

    # 25) Double quotations inside sentence
    # She turned to him, "This is great." she said.
    # ["She turned to him, \"This is great.\" she said."]

    # 26) Double quotations at the end of a sentence
    # She turned to him, "This is great." She held the book out to show him.
    # ["She turned to him, \"This is great.\"", "She held the book out to show him."]

    def test_rule_27(self):
        """Doubled exclamation points end a sentence together"""
        text = "Hello!! Long time no see."
        expected = ["Hello!!", "Long time no see."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_28(self):
        """Doubled question marks end a sentence together"""
        text = "Hello?? Who is there?"
        expected = ["Hello??", "Who is there?"]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_29(self):
        """Exclamation point followed by question mark ends a sentence"""
        text = "Hello!? Is that you?"
        expected = ["Hello!?", "Is that you?"]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_30(self):
        """Question mark followed by exclamation point ends a sentence"""
        text = "Hello?! Is that you?"
        expected = ["Hello?!", "Is that you?"]
        self.assertEqual(_get_sentences(text), expected)

    # 31) List (period followed by parens and no period to end item)
    # 1.) The first item 2.) The second item
    # ["1.) The first item", "2.) The second item"]

    # 32) List (period followed by parens and period to end item)
    # 1.) The first item. 2.) The second item.
    # ["1.) The first item.", "2.) The second item."]

    # 33) List (parens and no period to end item)
    # 1) The first item 2) The second item
    # ["1) The first item", "2) The second item"]

    # 34) List (parens and period to end item)
    # 1) The first item. 2) The second item.
    # ["1) The first item.", "2) The second item."]

    # 35) List (period to mark list and no period to end item)
    # 1. The first item 2. The second item
    # ["1. The first item", "2. The second item"]

    # 36) List (period to mark list and period to end item)
    # 1. The first item. 2. The second item.
    # ["1. The first item.", "2. The second item."]

    # 37) List with bullet
    # • 9. The first item • 10. The second item
    # ["• 9. The first item", "• 10. The second item"]

    # 38) List with hyphen
    # NOTE(review): the hyphen bullets appear to be missing from this example
    # (possibly lost as unusual Unicode characters) -- confirm against the
    # upstream Golden Rules list before turning it into a test.
    # 9. The first item 10. The second item
    # ["9. The first item", "10. The second item"]

    # 39) Alphabetical list
    # a. The first item b. The second item c. The third list item
    # ["a. The first item", "b. The second item", "c. The third list item"]

    def test_rule_40(self):
        """Stray newline in the middle of a sentence (PDF) becomes a space"""
        text = "This is a sentence\ncut off in the middle because pdf."
        expected = ["This is a sentence cut off in the middle because pdf."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_41(self):
        """Stray newline after a trailing space yields a single space"""
        text = "It was a cold \nnight in the city."
        expected = ["It was a cold night in the city."]
        self.assertEqual(_get_sentences(text), expected)

    # 42) Lower case list separated by newline
    # features\ncontact manager\nevents, activities\n
    # ["features", "contact manager", "events, activities"]

    # 43) Geo Coordinates
    # You can find it at N°. 1026.253.553. That is where the treasure is.
    # ["You can find it at N°. 1026.253.553.", "That is where the treasure is."]

    # 44) Named entities with an exclamation point
    # She works at Yahoo! in the accounting department.
    # ["She works at Yahoo! in the accounting department."]

    # 45) I as a sentence boundary and I as an abbreviation
    # We make a good team, you and I. Did you see Albert I. Jones yesterday?
    # ["We make a good team, you and I.", "Did you see Albert I. Jones yesterday?"]

    # 46) Ellipsis at end of quotation
    # Thoreau argues that by simplifying ones life, “the laws of the universe will appear less complex. . . .”
    # ["Thoreau argues that by simplifying ones life, “the laws of the universe will appear less complex. . . .”"]

    # 47) Ellipsis with square brackets
    # "Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).
    # ["\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55)."]

    # 48) Ellipsis as sentence boundary (standard ellipsis rules)
    # If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.
    # ["If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", "Next sentence."]

    # 49) Ellipsis as sentence boundary (non-standard ellipsis rules)
    # I never meant that.... She left the store.
    # ["I never meant that....", "She left the store."]

    # def test_rule_49(self):
    #     """Ellipsis as sentence boundary (non-standard ellipsis rules)"""
    #     self.assertEqual(
    #         _get_sentences("I never meant that.... She left the store."),
    #         ["I never meant that....", "She left the store."],
    #     )

    # 50) Ellipsis as non sentence boundary
    # I wasnt really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didnt mean it.
    # ["I wasnt really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didnt mean it."]

    # 51) 4-dot ellipsis
    # One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .
    # ["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."]


def _get_sentences(text):
    """Split *text* into sentences and return each sentence's text.

    The input is passed to ``sentences`` with number and currency
    verbalization switched off, and the ``text`` attribute of every item
    it yields is collected, in order, into a list.
    """
    segmented = sentences(text, verbalize_numbers=False, verbalize_currency=False)

    sentence_texts = []
    for sentence in segmented:
        sentence_texts.append(sentence.text)

    return sentence_texts


# -----------------------------------------------------------------------------

# Script entry point: when this file is executed directly (rather than
# imported), hand control to unittest's command-line test runner.
if __name__ == "__main__":
    unittest.main()