Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

I have implemented parse_roman() function #64

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
61f57d7
Implemented parse_roman() function
AmPhIbIaN26 Apr 17, 2021
6664b6f
Moved test_numeral_roman.py
AmPhIbIaN26 Apr 17, 2021
376db40
Implemented roman_numera()l function
AmPhIbIaN26 Apr 18, 2021
ce4e5c6
Adding numeral_system as a parameter to functions
AmPhIbIaN26 Apr 24, 2021
6d420a7
added _valid_input_by_numeral_system()
AmPhIbIaN26 Apr 28, 2021
bf7d6e3
Added NUMERAL_SYSTEMS
AmPhIbIaN26 May 3, 2021
5104b76
fixed Unicode issue for Hindi, Spanish and Russian
AmPhIbIaN26 May 3, 2021
94d9441
Update test_numeral_roman.py
AmPhIbIaN26 May 3, 2021
475ad04
Update test_number_parsing.py
AmPhIbIaN26 May 3, 2021
f847cf6
Merge pull request #1 from AmPhIbIaN26/parse_roman(numeral-support)
AmPhIbIaN26 May 3, 2021
45bfdc5
Update README.rst
AmPhIbIaN26 May 4, 2021
c13102f
Merge pull request #2 from AmPhIbIaN26/parse_roman(numeral-support)
AmPhIbIaN26 May 4, 2021
973664b
Delete test.py
AmPhIbIaN26 May 4, 2021
0e89680
Made all the changes and added support for incorrect roman numbers
AmPhIbIaN26 May 6, 2021
9704eff
Removed .lower() from roman regex expressions
AmPhIbIaN26 May 6, 2021
554d553
Update test_numeral_systems.py
AmPhIbIaN26 May 6, 2021
77ae66f
Merge pull request #3 from AmPhIbIaN26/parse_roman(numeral-support)
AmPhIbIaN26 May 6, 2021
fb9bff7
Update number_parser/parser.py
AmPhIbIaN26 May 8, 2021
480cf7c
Update number_parser/parser.py
AmPhIbIaN26 May 8, 2021
faecf42
Merge pull request #4 from AmPhIbIaN26/parse_roman(regex-approach)
AmPhIbIaN26 May 8, 2021
cbc4661
Added more test cases for better code coverage
AmPhIbIaN26 May 12, 2021
086f5ec
Merge pull request #5 from AmPhIbIaN26/parse_roman(numeral-support)
AmPhIbIaN26 May 12, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Identifying the numbers in a text string, converting them to corresponding numer
This also supports ordinal number conversion (for English only).

>>> from number_parser import parse
>>> parse("I have two hats and thirty seven coats")
>>> parse("I have two hats and thirty seven coats", numeral_systems=['decimal'])
'I have 2 hats and 37 coats'
>>> parse("One, Two, Three go")
'1, 2, 3 go'
Expand Down
2 changes: 1 addition & 1 deletion number_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from number_parser.parser import parse, parse_number, parse_ordinal, parse_fraction
from number_parser.parser import parse, parse_number, parse_ordinal, parse_fraction, NUMERAL_SYSTEMS
106 changes: 90 additions & 16 deletions number_parser/parser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import re
from importlib import import_module
import unicodedata

SENTENCE_SEPARATORS = [".", ","]
SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru']
RE_BUG_LANGUAGES = ['hi']
NUMERAL_SYSTEMS = ('decimal', 'roman')
ROMAN_REGEX_EXPRESSION = "(?i)^(m{0,3})(cm|cd|d?c{0,4})(xc|xl|l?x{0,4})(ix|iv|v?i{0,4})$"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let’s make this a private constant, so that we can freely rename it or move it in the future if we wish without breaking the API:

Suggested change
ROMAN_REGEX_EXPRESSION = "(?i)^(m{0,3})(cm|cd|d?c{0,4})(xc|xl|l?x{0,4})(ix|iv|v?i{0,4})$"
_ROMAN_REGEX_EXPRESSION = "(?i)^(m{0,3})(cm|cd|d?c{0,4})(xc|xl|l?x{0,4})(ix|iv|v?i{0,4})$"



class LanguageData:
Expand Down Expand Up @@ -241,7 +244,11 @@ def parse_ordinal(input_string, language=None):
return parse_number(output_string, language)


def parse_number(input_string, language=None):
def _is_roman(search_string):
return re.search(ROMAN_REGEX_EXPRESSION, search_string, re.IGNORECASE)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(?i) is already in the pattern. Also, this function should return a boolean value.

Suggested change
return re.search(ROMAN_REGEX_EXPRESSION, search_string, re.IGNORECASE)
return bool(re.search(ROMAN_REGEX_EXPRESSION, search_string))



def parse_number(input_string, language=None, numeral_systems=None):
"""Converts a single number written in natural language to a numeric type"""
if not input_string.strip():
return None
Expand All @@ -252,20 +259,35 @@ def parse_number(input_string, language=None):
if language is None:
language = _valid_tokens_by_language(input_string)

lang_data = LanguageData(language)
if numeral_systems is None:
if _is_roman(input_string):
numeral_systems = ['roman']

elif language in SUPPORTED_LANGUAGES and not _is_roman(input_string):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not _is_roman(input_string) will always be true, given the preceding if statement.

Suggested change
elif language in SUPPORTED_LANGUAGES and not _is_roman(input_string):
elif language in SUPPORTED_LANGUAGES:

Also, I am probably forgetting something, but why are we forcing decimal for any supported language? I could understand doing it for en given I is an English word, but I don’t think we need to avoid Roman numerals in Spanish, for example.

numeral_systems = ['decimal']

for numeral_system in numeral_systems:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will fail if numeral_systems is None.

if numeral_system == 'decimal':
lang_data = LanguageData(language)

tokens = _tokenize(input_string, language)
normalized_tokens = _normalize_tokens(tokens)
for index, token in enumerate(normalized_tokens):
if _is_cardinal_token(token, lang_data) or not token.strip():
continue
if _is_skip_token(token, lang_data) and index != 0:
continue
return None
number_built = _build_number(normalized_tokens, lang_data)
if len(number_built) == 1:
return int(number_built[0])
return None
Comment on lines +280 to +284
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those return None seem like an issue now within a for loop. It looks like they will prevent the for loop to reach the next iteration (for the next numeral system).

Maybe you could move this code into a _parse_decimal function, and only return if the result is not None, else let the for loop go to the next numeral system.


tokens = _tokenize(input_string, language)
normalized_tokens = _normalize_tokens(tokens)
for index, token in enumerate(normalized_tokens):
if _is_cardinal_token(token, lang_data) or not token.strip():
continue
if _is_skip_token(token, lang_data) and index != 0:
continue
return None
number_built = _build_number(normalized_tokens, lang_data)
if len(number_built) == 1:
return int(number_built[0])
return None
elif numeral_system == 'roman':
return int(_parse_roman(input_string))

else:
raise ValueError(f'{numeral_system!r} is not a supported numeral system')


def parse_fraction(input_string, language=None):
Expand Down Expand Up @@ -298,14 +320,33 @@ def parse_fraction(input_string, language=None):
return None


def parse(input_string, language=None):
def parse(input_string, language=None, numeral_systems=None):
"""
Converts all the numbers in a sentence written in natural language to their numeric type while keeping
the other words unchanged. Returns the transformed string.
"""
if numeral_systems is None:
numeral_systems = NUMERAL_SYSTEMS

if language is None:
language = _valid_tokens_by_language(input_string)

result = input_string
for numeral_system in numeral_systems:

if numeral_system == 'decimal':
result = _parse_decimal(result, language)

elif numeral_system == 'roman':
result = _parse_roman(result)

else:
raise ValueError(f'"{numeral_system}" is not a supported numeral system')

return result


def _parse_decimal(input_string, language):
lang_data = LanguageData(language)

tokens = _tokenize(input_string, language)
Expand Down Expand Up @@ -359,8 +400,41 @@ def _build_and_add_number(pop_last_space=False):

_build_and_add_number()
current_sentence.append(token)

_build_and_add_number()

final_sentence.extend(current_sentence)
return ''.join(final_sentence).strip()


def _parse_roman(input_string):
tokens = _tokenize(input_string, None)
tokens = [item for item in tokens if item != '']
for token in tokens:
if _is_roman(token):
tokens[tokens.index(token)] = str(_build_roman(token))
final_sentence = ''.join(tokens)

return final_sentence


def _build_roman(roman_number):
roman = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}

num_tokens = re.split(ROMAN_REGEX_EXPRESSION, roman_number, re.IGNORECASE)
num_tokens = [item for item in num_tokens if item != '']

built_num = 0

for num_token in num_tokens:

if re.search('iv|ix|xl|xc|cd|cm', num_token, re.IGNORECASE):
built_num += roman[num_token[1].lower()] - roman[num_token[0].lower()]

elif re.search('[XLVD][IXC]{1,4}', num_token, re.IGNORECASE):
built_num += roman[num_token[0].lower()] + (roman[num_token[1].lower()] * (len(num_token) - 1))

elif re.search('[ixcm]{1,4}|[vld]{1}', num_token, re.IGNORECASE):
built_num += roman[num_token[0].lower()] * len(num_token)

return built_num

2 changes: 1 addition & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def get_test_files(path, prefix):
def _test_files(path, language, is_ordinal=True):
fnx = parse_ordinal if is_ordinal else parse_number
for filename in get_test_files(path, f'{language}_'):
with open(filename, "r") as csv_file:
with open(filename, "r", encoding="utf8") as csv_file:
csv_reader = csv.DictReader(csv_file)
for row in csv_reader:
try:
Expand Down
Loading