241 lines
10 KiB
Python
241 lines
10 KiB
Python
# -*- coding: utf-8 -*-
|
||
import string
|
||
import re
|
||
from pysbd.utils import Rule, Text
|
||
from functools import partial
|
||
|
||
|
||
class ListItemReplacer(object):
    r"""Mark list items in a text so that each item starts on its own line.

    The replacer looks for alphabetical ("a.", "a)", "(a)"), roman numeral
    and numbered ("1.", "1)") list markers.  A marker is only treated as a
    list item when a neighbouring marker is the adjacent letter / numeral /
    number.  Confirmed items get a "\r" inserted in front of them, and the
    period after a list marker is replaced with the placeholder '∯'.

    Temporary symbols used while processing:
      * '♨' marks the period of a numbered list item (later turned into '∯')
      * '☝' marks a numbered list item followed by ")" (later removed)

    `Rule` and `Text` come from `pysbd.utils`; `Text(...).apply(*rules)` is
    presumably a sequence of regex substitutions -- confirm in pysbd.utils.
    """

    # Lowercase roman numerals i..xx in ascending order, each listed once.
    # The position in this list decides whether two markers are consecutive,
    # so duplicated entries would break the xiv -> xv adjacency.
    ROMAN_NUMERALS = ("i ii iii iv v vi vii viii ix x xi xii xiii xiv xv "
                      "xvi xvii xviii xix xx").split(' ')
    LATIN_NUMERALS = list(string.ascii_lowercase)

    # Rubular: http://rubular.com/r/XcpaJKH0sz
    ALPHABETICAL_LIST_WITH_PERIODS = r'(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)'

    # Rubular: http://rubular.com/r/Gu5rQapywf
    # TODO: Make sure below regex call is case-insensitive
    ALPHABETICAL_LIST_WITH_PARENS = r'(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

    # (pattern, replacement)
    SubstituteListPeriodRule = Rule('♨', '∯')
    ListMarkerRule = Rule('☝', '')

    # Rubular: http://rubular.com/r/Wv4qLdoPx7
    # https://regex101.com/r/62YBlv/1
    SpaceBetweenListItemsFirstRule = Rule(r'(?<=\S\S)\s(?=\S\s*\d+♨)', "\r")

    # Rubular: http://rubular.com/r/AizHXC6HxK
    # https://regex101.com/r/62YBlv/2
    SpaceBetweenListItemsSecondRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}♨)', "\r")

    # Rubular: http://rubular.com/r/GE5q6yID2j
    # https://regex101.com/r/62YBlv/3
    SpaceBetweenListItemsThirdRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}☝)', "\r")

    # Finds the *number* of a numbered list item (the period stays outside
    # the match).  The item may be preceded by whitespace, start of line,
    # or a '-' / '⁃' bullet, and followed by ". " or ".)".
    NUMBERED_LIST_REGEX_1 = (
        r'\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)'
        r'|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))'
        r'|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)'
        r'|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)'
        r'|(?<=\s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))'
        r'|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))'
    )
    # 1. abcd
    # 2. xyz
    # Same contexts as NUMBERED_LIST_REGEX_1, but the match includes the
    # trailing period so that it can be substituted.
    NUMBERED_LIST_REGEX_2 = (
        r'(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)'
        r'|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))'
        r'|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)'
        r'|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)'
        r'|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))'
        r'|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))'
    )
    # 1) abcd
    # 2) xyz
    NUMBERED_LIST_PARENS_REGEX = r'\d{1,2}(?=\)\s)'

    # Rubular: http://rubular.com/r/NsNFSqrNvJ
    # TODO: Make sure below regex call is case-insensitive
    EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX = r'\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

    # Rubular: http://rubular.com/r/wMpnVedEIb
    # TODO: Make sure below regex call is case-insensitive
    ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX = r'(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\.'

    # Rubular: http://rubular.com/r/GcnmQt4a3I
    ROMAN_NUMERALS_IN_PARENTHESES = r'\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])'

    def __init__(self, text):
        """Store the text that the formatting passes will rewrite in place."""
        self.text = text

    def add_line_break(self):
        """Run every list-formatting pass and return the rewritten text.

        Order: alphabetical lists, roman numeral lists, numbered lists with
        periods, numbered lists with parentheses.  Each pass updates
        `self.text`.
        """
        self.format_alphabetical_lists()
        self.format_roman_numeral_lists()
        self.format_numbered_list_with_periods()
        self.format_numbered_list_with_parens()
        return self.text

    def replace_parens(self):
        """Return the text with parenthesised roman numerals protected.

        "(ii)" followed by whitespace and an uppercase letter becomes
        "&✂&ii&⌬&".  `self.text` itself is not modified.
        """
        text = re.sub(self.ROMAN_NUMERALS_IN_PARENTHESES,
                      r'&✂&\1&⌬&', self.text)
        return text

    def format_numbered_list_with_parens(self):
        """Handle "1) ... 2) ..." lists: mark, add line breaks, unmark."""
        self.replace_parens_in_numbered_list()
        self.add_line_breaks_for_numbered_list_with_parens()
        # Drop the temporary '☝' markers again.
        self.text = Text(self.text).apply(self.ListMarkerRule)

    def replace_periods_in_numbered_list(self):
        """Replace the period of consecutive numbered items with '♨'."""
        self.scan_lists(self.NUMBERED_LIST_REGEX_1, self.NUMBERED_LIST_REGEX_2,
                        '♨', strip=True)

    def format_numbered_list_with_periods(self):
        """Handle "1. ... 2. ..." lists: mark, add line breaks, finalise."""
        self.replace_periods_in_numbered_list()
        self.add_line_breaks_for_numbered_list_with_periods()
        # Turn the temporary '♨' into the final '∯' period placeholder.
        self.text = Text(self.text).apply(self.SubstituteListPeriodRule)

    def format_alphabetical_lists(self):
        """Format latin-letter lists ("a." and "a)" / "(a)" styles).

        Returns the updated text, which is also stored in `self.text`.
        """
        # `self.txt` mirrors `self.text`; the attribute is kept only because
        # earlier versions of this class set it.
        self.txt = self.add_line_breaks_for_alphabetical_list_with_periods(
            roman_numeral=False)
        self.txt = self.add_line_breaks_for_alphabetical_list_with_parens(
            roman_numeral=False)
        return self.txt

    def format_roman_numeral_lists(self):
        """Format roman-numeral lists ("i." and "i)" / "(i)" styles).

        Returns the updated text, which is also stored in `self.text`.
        """
        # See format_alphabetical_lists for why `self.txt` is assigned.
        self.txt = self.add_line_breaks_for_alphabetical_list_with_periods(
            roman_numeral=True)
        self.txt = self.add_line_breaks_for_alphabetical_list_with_parens(
            roman_numeral=True)
        return self.txt

    def add_line_breaks_for_alphabetical_list_with_periods(
            self, roman_numeral=False):
        """Process "a. b. c." style markers; return the updated text."""
        txt = self.iterate_alphabet_array(
            self.ALPHABETICAL_LIST_WITH_PERIODS,
            roman_numeral=roman_numeral)
        return txt

    def add_line_breaks_for_alphabetical_list_with_parens(self, roman_numeral=False):
        """Process "a) b) (c)" style markers; return the updated text."""
        txt = self.iterate_alphabet_array(
            self.ALPHABETICAL_LIST_WITH_PARENS,
            parens=True,
            roman_numeral=roman_numeral)
        return txt

    def scan_lists(self, regex1, regex2, replacement, strip=False):
        """Mark numbers that belong to a run of consecutive list numbers.

        Args:
            regex1: pattern whose matches are the candidate numbers (each
                match must be convertible with `int`).
            regex2: pattern used to substitute a confirmed number in
                `self.text`.
            replacement: marker appended to a confirmed number.
            strip: forwarded to `substitute_found_list_items`.

        A number is confirmed when the next candidate is `number + 1`, or,
        failing that, when the previous candidate is `number - 1` or the
        pair is 9/0 in either order.
        """
        numbers = [int(found) for found in re.findall(regex1, self.text)]
        last_index = len(numbers) - 1
        for ind, item in enumerate(numbers):
            # Bounds check first: there is no "next" for the final candidate.
            if ind < last_index and item + 1 == numbers[ind + 1]:
                self.substitute_found_list_items(regex2, item, strip, replacement)
            elif ind > 0:
                previous = numbers[ind - 1]
                if (item - 1 == previous or
                        (item == 0 and previous == 9) or
                        (item == 9 and previous == 0)):
                    self.substitute_found_list_items(regex2, item, strip, replacement)

    def substitute_found_list_items(self, regex, each, strip, replacement):
        """Append `replacement` to every match of `regex` that equals `each`.

        Args:
            regex: pattern locating candidate list markers in `self.text`.
            each: the list number to mark.
            strip: when true, surrounding whitespace is removed from a match
                before it is compared (and from the text returned for it).
            replacement: marker written directly after the number.

        Matches for other numbers are left as they are.
        """

        def replace_item(match, val=None, strip=False, repl='♨'):
            found = match.group()
            if strip:
                found = found.strip()
            # A single character is compared as-is; longer matches lose any
            # leading/trailing '.', ']' and ')' before the comparison.
            bare = found if len(found) == 1 else found.strip('.])')
            if str(val) == bare:
                return "{}{}".format(val, repl)
            return found

        self.text = re.sub(regex, partial(replace_item, val=each,
                                          strip=strip, repl=replacement),
                           self.text)

    def add_line_breaks_for_numbered_list_with_periods(self):
        """Insert "\\r" before '♨'-marked items that share a line.

        Nothing is done when no marker is present, when two markers are
        already separated by a line break, or when the text contains
        "for <number>♨ <lowercase letter>".
        """
        if ('♨' in self.text) and (not re.search(
                '♨.+(\n|\r).+♨', self.text)) and (not re.search(
                    r'for\s\d{1,2}♨\s[a-z]', self.text)):
            self.text = Text(self.text).apply(self.SpaceBetweenListItemsFirstRule,
                                              self.SpaceBetweenListItemsSecondRule)

    def replace_parens_in_numbered_list(self):
        """Mark consecutive "1)" style numbers with '☝'."""
        # The scan is intentionally run twice: numbers marked by the first
        # pass no longer match the pattern, which can leave a 9/0 pair
        # adjacent in the candidate list for the second pass to pick up.
        self.scan_lists(
            self.NUMBERED_LIST_PARENS_REGEX, self.NUMBERED_LIST_PARENS_REGEX, '☝')
        self.scan_lists(self.NUMBERED_LIST_PARENS_REGEX, self.NUMBERED_LIST_PARENS_REGEX, '☝')

    def add_line_breaks_for_numbered_list_with_parens(self):
        """Insert "\\r" before '☝'-marked items that share a line."""
        if '☝' in self.text and not re.search("☝.+\n.+☝|☝.+\r.+☝", self.text):
            self.text = Text(self.text).apply(
                self.SpaceBetweenListItemsThirdRule)

    def replace_alphabet_list(self, a):
        r"""Return the text with the "<a>." list marker put on a new line.

        Only markers whose letter equals `a` exactly are rewritten to
        "\r<a>∯"; applied for every letter in turn this gives:

        Input: 'a. ffegnog b. fgegkl c.'
        Output: \ra∯ ffegnog \rb∯ fgegkl \rc∯
        """

        def replace_letter_period(match, val=None):
            found = match.group()
            letter = found.strip('.')
            if letter == val:
                return '\r{}∯'.format(letter)
            return found

        txt = re.sub(self.ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX,
                     partial(replace_letter_period, val=a),
                     self.text, flags=re.IGNORECASE)
        return txt

    def replace_alphabet_list_parens(self, a):
        r"""Return the text with the "<a>)" / "(<a>)" marker on a new line.

        Only markers equal to `a` are rewritten; an opening parenthesis is
        replaced by "&✂&".  Applied for every letter in turn this gives:

        Input: "a) ffegnog (b) fgegkl c)"
        Output: "\ra) ffegnog \r&✂&b) fgegkl \rc)"
        """

        def replace_alphabet_paren(match, val=None):
            found = match.group()
            if '(' in found:
                letters = found.strip('(')
                if letters == val:
                    return '\r&✂&{}'.format(letters)
                return found
            if found == val:
                return '\r{}'.format(found)
            return found

        # Make it case-insensitive
        txt = re.sub(self.EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX,
                     partial(replace_alphabet_paren, val=a),
                     self.text, flags=re.IGNORECASE)
        return txt

    def replace_correct_alphabet_list(self, a, parens):
        """Dispatch to the paren or period variant for list marker `a`."""
        if parens:
            a = self.replace_alphabet_list_parens(a)
        else:
            a = self.replace_alphabet_list(a)
        return a

    def last_array_item_replacement(self, a, i, alphabet, list_array, parens):
        """Return the text after handling the final candidate marker `a`.

        The marker is rewritten only when the previous candidate
        (`list_array[i - 1]`) is its direct neighbour in `alphabet`;
        otherwise the current text is returned unchanged.
        """
        if (not alphabet and not list_array) or (
                list_array[i - 1] not in alphabet) or (a not in alphabet):
            return self.text
        # NOTE(review): for i == 0 the index -1 refers to the item itself,
        # so the distance is 0 and a lone marker is never rewritten.
        if abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1:
            return self.text
        result = self.replace_correct_alphabet_list(a, parens)
        return result

    def other_items_replacement(self, a, i, alphabet, list_array, parens):
        """Return the text after handling a non-final candidate marker `a`.

        The marker is rewritten when the next candidate directly follows it
        in `alphabet`, or the previous candidate is a direct neighbour;
        otherwise the current text is returned unchanged.
        """
        if (not alphabet and not list_array) or (
                list_array[i - 1] not in alphabet) or (a not in alphabet) or (
                    list_array[i + 1] not in alphabet):
            return self.text
        # NOTE(review): for i == 0, list_array[i - 1] wraps around to the
        # last candidate -- confirm that this is intended.
        if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 and \
                abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1:
            return self.text
        result = self.replace_correct_alphabet_list(a, parens)
        return result

    def iterate_alphabet_array(self, regex, parens=False, roman_numeral=False):
        """Rewrite every confirmed alphabetical / roman list marker.

        Args:
            regex: pattern that extracts the candidate markers.
            parens: true for "a)" / "(a)" style, false for "a." style.
            roman_numeral: use ROMAN_NUMERALS instead of LATIN_NUMERALS as
                the ordered alphabet.

        Returns:
            The updated text (also stored in `self.text`).
        """
        list_array = re.findall(regex, self.text)
        alphabet = self.ROMAN_NUMERALS if roman_numeral else self.LATIN_NUMERALS
        # Keep only candidates that are valid symbols of the chosen alphabet.
        list_array = [i for i in list_array if i in alphabet]
        for ind, each in enumerate(list_array):
            if ind == len(list_array) - 1:
                self.text = self.last_array_item_replacement(each, ind, alphabet, list_array, parens)
            else:
                self.text = self.other_items_replacement(
                    each, ind, alphabet, list_array, parens)
        return self.text