# ai-content-maker/.venv/Lib/site-packages/tests/test_golden_rules.py
#
# 297 lines
# 12 KiB
# Python
# Raw Normal View History
#
# 2024-05-03 04:18:51 +03:00
#!/usr/bin/env python3
"""
Test sentence segmentation against the "Golden Rules":
https://github.com/diasks2/pragmatic_segmenter#the-golden-rules
"""
import unittest
from gruut import sentences
class GoldenRulesTestCase(unittest.TestCase):
    """Check sentence splitting against the pragmatic_segmenter "Golden Rules".

    Every active test passes one golden-rule example through
    ``_get_sentences`` and compares the returned sentence texts with the
    expected list.  Where the expected text differs from the input (dropped
    periods, expanded abbreviations), a NOTE comment says so.  Rules that
    appear only as comments have no active test in this class.
    """

    def test_rule_1(self):
        """Rule 1: a simple period ends a sentence."""
        text = "Hello World. My name is Jonas."
        expected = ["Hello World.", "My name is Jonas."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_2(self):
        """Rule 2: a question mark ends a sentence."""
        text = "What is your name? My name is Jonas."
        expected = ["What is your name?", "My name is Jonas."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_3(self):
        """Rule 3: an exclamation point ends a sentence."""
        text = "There it is! I found it."
        expected = ["There it is!", "I found it."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_4(self):
        """Rule 4: one-letter upper case abbreviations do not split."""
        # NOTE: gruut removes the "." from E
        text = "My name is Jonas E. Smith."
        expected = ["My name is Jonas E Smith."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_5(self):
        """Rule 5: one-letter lower case abbreviations do not split."""
        # NOTE: gruut removes the "." from p
        text = "Please turn to p. 55."
        expected = ["Please turn to p 55."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_6(self):
        """Rule 6: two-letter lower case abbreviation in mid-sentence."""
        # NOTE: gruut expands abbreviations
        text = "Were Jane and co. at the party?"
        expected = ["Were Jane and company at the party?"]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_7(self):
        """Rule 7: two-letter upper case abbreviation in mid-sentence."""
        # NOTE: gruut expands abbreviations
        text = "They closed the deal with Pitt, Briggs & Co. at noon."
        expected = ["They closed the deal with Pitt, Briggs and Company at noon."]
        self.assertEqual(_get_sentences(text), expected)

    # Rules 8 and 9 are disabled (kept here as commented-out tests).
    # def test_rule_8(self):
    #     """Two letter lower case abbreviations at the end of a sentence"""
    #     self.assertEqual(
    #         _get_sentences("Let's ask Jane and co. They should know."),
    #         ["Let's ask Jane and company. They should know."],
    #     )

    # def test_rule_9(self):
    #     """Two letter upper case abbreviations at the end of a sentence"""
    #     self.assertEqual(
    #         _get_sentences("They closed the deal with Pitt, Briggs & Co. It closed yesterday."),
    #         ["They closed the deal with Pitt, Briggs and Company. It closed yesterday."],
    #     )

    def test_rule_10(self):
        """Rule 10: two-letter (prepositive) abbreviations do not split."""
        # NOTE: gruut expands abbreviations
        text = "I can see Mt. Fuji from here."
        expected = ["I can see Mount Fuji from here."]
        self.assertEqual(_get_sentences(text), expected)

    # Rule 11 is disabled (kept here as a commented-out test).
    # def test_rule_11(self):
    #     """Two letter (prepositive & postpositive) abbreviations"""
    #     self.assertEqual(
    #         _get_sentences("St. Michael's Church is on 5th st. near the light."),
    #         ["Saint Michael's Church is on 5th street near the light."],
    #     )

    def test_rule_12(self):
        """Rule 12: possessive two-letter abbreviations do not split."""
        # NOTE: gruut expands abbreviations
        text = "That is JFK Jr.'s book."
        expected = ["That is J F K Junior's book."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_13(self):
        """Rule 13: multi-period abbreviation in mid-sentence."""
        # NOTE: gruut expands abbreviations
        text = "I visited the U.S.A. last year."
        expected = ["I visited the U S A last year."]
        self.assertEqual(_get_sentences(text), expected)

    # 14) Multi-period abbreviations at the end of a sentence
    # I live in the E.U. How about you?
    # ["I live in the E.U.", "How about you?"]

    # 15) U.S. as sentence boundary
    # I live in the U.S. How about you?
    # ["I live in the U.S.", "How about you?"]

    # 16) U.S. as non sentence boundary with next word capitalized
    # I work for the U.S. Government in Virginia.
    # ["I work for the U.S. Government in Virginia."]

    def test_rule_17(self):
        """Rule 17: "U.S." in mid-sentence is not a sentence boundary."""
        # NOTE: gruut expands abbreviations
        text = "I have lived in the U.S. for 20 years."
        expected = ["I have lived in the U S for 20 years."]
        self.assertEqual(_get_sentences(text), expected)

    # 18) A.M. / P.M. as non sentence boundary and sentence boundary
    # At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.
    # ["At 5 a.m. Mr. Smith went to the bank.", "He left the bank at 6 P.M.", "Mr. Smith then went to the store."]

    def test_rule_19(self):
        """Rule 19: the period inside a number is not a sentence boundary."""
        text = "She has $100.00 in her bag."
        expected = ["She has $100.00 in her bag."]
        self.assertEqual(_get_sentences(text), expected)

    # 21) Parenthetical inside sentence
    # He teaches science (He previously worked for 5 years as an engineer.) at the local University.
    # ["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]

    # 22) Email addresses
    # Her email is Jane.Doe@example.com. I sent her an email.
    # ["Her email is Jane.Doe@example.com.", "I sent her an email."]

    # 23) Web addresses
    # The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.
    # ["The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out."]

    # 24) Single quotations inside sentence
    # She turned to him, 'This is great.' she said.
    # ["She turned to him, 'This is great.' she said."]

    # 25) Double quotations inside sentence
    # She turned to him, "This is great." she said.
    # ["She turned to him, \"This is great.\" she said."]

    # 26) Double quotations at the end of a sentence
    # She turned to him, "This is great." She held the book out to show him.
    # ["She turned to him, \"This is great.\"", "She held the book out to show him."]

    def test_rule_27(self):
        """Rule 27: doubled exclamation points end a sentence."""
        text = "Hello!! Long time no see."
        expected = ["Hello!!", "Long time no see."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_28(self):
        """Rule 28: doubled question marks end a sentence."""
        text = "Hello?? Who is there?"
        expected = ["Hello??", "Who is there?"]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_29(self):
        """Rule 29: exclamation point followed by question mark ends a sentence."""
        text = "Hello!? Is that you?"
        expected = ["Hello!?", "Is that you?"]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_30(self):
        """Rule 30: question mark followed by exclamation point ends a sentence."""
        text = "Hello?! Is that you?"
        expected = ["Hello?!", "Is that you?"]
        self.assertEqual(_get_sentences(text), expected)

    # 31) List (period followed by parens and no period to end item)
    # 1.) The first item 2.) The second item
    # ["1.) The first item", "2.) The second item"]

    # 32) List (period followed by parens and period to end item)
    # 1.) The first item. 2.) The second item.
    # ["1.) The first item.", "2.) The second item."]

    # 33) List (parens and no period to end item)
    # 1) The first item 2) The second item
    # ["1) The first item", "2) The second item"]

    # 34) List (parens and period to end item)
    # 1) The first item. 2) The second item.
    # ["1) The first item.", "2) The second item."]

    # 35) List (period to mark list and no period to end item)
    # 1. The first item 2. The second item
    # ["1. The first item", "2. The second item"]

    # 36) List (period to mark list and period to end item)
    # 1. The first item. 2. The second item.
    # ["1. The first item.", "2. The second item."]

    # 37) List with bullet
    # • 9. The first item • 10. The second item
    # ["• 9. The first item", "• 10. The second item"]

    # 38) List with hyphen
    # NOTE(review): the example below shows no hyphen before each item;
    # presumably it was lost in copying -- confirm against the upstream rules.
    # 9. The first item 10. The second item
    # ["9. The first item", "10. The second item"]

    # 39) Alphabetical list
    # a. The first item b. The second item c. The third list item
    # ["a. The first item", "b. The second item", "c. The third list item"]

    def test_rule_40(self):
        """Rule 40: a stray newline in mid-sentence (PDF text) does not split."""
        text = "This is a sentence\ncut off in the middle because pdf."
        expected = ["This is a sentence cut off in the middle because pdf."]
        self.assertEqual(_get_sentences(text), expected)

    def test_rule_41(self):
        """Rule 41: a stray newline after a space in mid-sentence does not split."""
        text = "It was a cold \nnight in the city."
        expected = ["It was a cold night in the city."]
        self.assertEqual(_get_sentences(text), expected)

    # 42) Lower case list separated by newline
    # features\ncontact manager\nevents, activities\n
    # ["features", "contact manager", "events, activities"]

    # 43) Geo Coordinates
    # You can find it at N°. 1026.253.553. That is where the treasure is.
    # ["You can find it at N°. 1026.253.553.", "That is where the treasure is."]

    # 44) Named entities with an exclamation point
    # She works at Yahoo! in the accounting department.
    # ["She works at Yahoo! in the accounting department."]

    # 45) I as a sentence boundary and I as an abbreviation
    # We make a good team, you and I. Did you see Albert I. Jones yesterday?
    # ["We make a good team, you and I.", "Did you see Albert I. Jones yesterday?"]

    # 46) Ellipsis at end of quotation
    # Thoreau argues that by simplifying ones life, “the laws of the universe will appear less complex. . . .”
    # ["Thoreau argues that by simplifying ones life, “the laws of the universe will appear less complex. . . .”"]

    # 47) Ellipsis with square brackets
    # "Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).
    # ["\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55)."]

    # 48) Ellipsis as sentence boundary (standard ellipsis rules)
    # If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.
    # ["If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", "Next sentence."]

    # 49) Ellipsis as sentence boundary (non-standard ellipsis rules)
    # I never meant that.... She left the store.
    # ["I never meant that....", "She left the store."]
    # def test_rule_49(self):
    #     """Ellipsis as sentence boundary (non-standard ellipsis rules)"""
    #     self.assertEqual(
    #         _get_sentences("I never meant that.... She left the store."),
    #         ["I never meant that....", "She left the store."],
    #     )

    # 50) Ellipsis as non sentence boundary
    # I wasnt really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didnt mean it.
    # ["I wasnt really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didnt mean it."]

    # 51) 4-dot ellipsis
    # One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .
    # ["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."]
def _get_sentences(text):
    """Split ``text`` with gruut and return the text of each sentence.

    ``sentences`` is called with ``verbalize_numbers=False`` and
    ``verbalize_currency=False``; the ``.text`` attribute of every item it
    yields is collected, in order, into a list.
    """
    segmented = sentences(text, verbalize_numbers=False, verbalize_currency=False)

    sentence_texts = []
    for sentence in segmented:
        sentence_texts.append(sentence.text)
    return sentence_texts
# -----------------------------------------------------------------------------
# Run the test suite only when this file is executed as a script,
# not when it is imported by a test runner.
if __name__ == "__main__":
    unittest.main()