-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentence_parser.py
137 lines (96 loc) · 3.39 KB
/
sentence_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#coding: utf-8
# This file is part of https://github.com/marcus67/rechtschreibung
import inspect
import importlib
import words
import rulesets
import log
import rule_decorator
import util
# Name of the module-wide logger used by the functions below. At module scope
# this `global` statement does not create the variable; `logger` only becomes
# bound once setup_logging() has assigned it.
global logger
def print_rules():
    """Write the string form of every registered rule to standard output.

    The rules are taken from ``rule_decorator.get_rules()`` and printed one
    per line, in the order that call yields them.
    """
    for registered_rule in rule_decorator.get_rules():
        print(str(registered_rule))
def get_single_char_pattern_map(p_rules):
    """Index the one-character rules of *p_rules* by their pattern.

    Rules whose pattern is not exactly one character long are skipped.  When
    a pattern shows up again, the later rule replaces the stored one unless
    the two disagree on ``seperates_words``; in that case a warning is logged
    and the later rule is dropped.

    Returns a dict mapping each single-character pattern to its rule.
    """
    global logger

    rules_by_char = {}
    for candidate in p_rules:
        if len(candidate.pattern) != 1:
            continue

        previous = rules_by_char.get(candidate.pattern)
        conflicting = (previous is not None and
                       previous.seperates_words != candidate.seperates_words)
        if not conflicting:
            rules_by_char[candidate.pattern] = candidate
            continue

        # Same character, contradictory word-separator flag: keep the first.
        fmt = "Pattern '%s' occors more than once! Ignoring %s." % (
            candidate.pattern, str(candidate.name))
        logger.warning(fmt)

    return rules_by_char
def parse_string(p_string):
    """Split *p_string* into rule matches and join their outputs with "+".

    The text is consumed from left to right.  At each position every rule
    from ``rule_decorator.get_rules()`` whose pattern fits into the remaining
    text is compared against the lower-cased text at that position.  A rule
    qualifies when its pattern is equal to that text and every bit of its
    ``condition`` is set in the condition flags computed for the position:

    * ``COND_BOS`` - set only until the first rule has been applied,
    * ``COND_BOW`` - set initially and after a rule with ``seperates_words``,
    * ``COND_EOS`` - the pattern would consume the whole remaining text,
    * ``COND_EOW`` - the character after the pattern belongs to a
      single-character rule with ``seperates_words``,
    * ``COND_CAPITALIZED`` - the first character is an upper-case letter.

    Among the qualifying rules the one with the most condition bits wins; on
    a tie the rule that comes first in the rule list is kept.

    If no rule qualifies, an error is logged and parsing stops; whatever was
    collected up to that point is still returned.

    Returns the ``build_string`` results of the chosen rules joined by "+".
    """
    global logger

    s = p_string
    cond = rule_decorator.COND_BOS | rule_decorator.COND_BOW
    rules = rule_decorator.get_rules()
    single_char_pattern_map = get_single_char_pattern_map(rules)
    result = []

    while len(s) > 0:
        best_rule = None
        best_cond = None
        best_cond_count = -1
        last_pattern_length = None

        for rule in rules:
            current_pattern_length = len(rule.pattern)
            if current_pattern_length <= len(s):
                temp_cond = rule_decorator.COND_NONE

                # NOTE(review): last_pattern_length is reset to None at the
                # top of every outer iteration and is only assigned after
                # this loop has finished, so the inner guard below is never
                # true and the loop always visits every rule.  It looks like
                # an early exit on a change of pattern length was intended
                # (which would need the rules grouped by length) - confirm
                # before moving the assignment into the loop.
                if (last_pattern_length is None or
                        last_pattern_length != current_pattern_length):
                    if best_rule is not None and last_pattern_length is not None:
                        break

                if len(s) == current_pattern_length:
                    temp_cond = temp_cond | rule_decorator.COND_EOS
                else:
                    # Look at the character right behind the pattern to find
                    # out whether the pattern ends a word.
                    next_char = rulesets.to_lower(s[current_pattern_length])
                    next_char_rule = single_char_pattern_map.get(next_char)
                    if next_char_rule is None:
                        fmt = "Character '%s' not found as pattern!" % next_char
                        # Logger.warn is deprecated; use warning() like the
                        # rest of this module.
                        logger.warning(fmt)
                    elif next_char_rule.seperates_words:
                        temp_cond = temp_cond | rule_decorator.COND_EOW

                # Add the flags carried over from the previous match.
                temp_cond = temp_cond | cond

                compare_s = s[0:current_pattern_length]
                first_letter = compare_s[0]
                # Upper-case only counts for characters that actually have
                # distinct upper and lower forms (not digits, blanks, ...).
                if (rulesets.to_upper(first_letter) == first_letter and
                        rulesets.to_upper(first_letter) != rulesets.to_lower(first_letter)):
                    temp_cond = temp_cond | rule_decorator.COND_CAPITALIZED

                compare_s = rulesets.string_to_lower(compare_s)
                logger.debug("compare %s to %s demanding %d with current condition %d" % (
                    compare_s, rule.pattern, rule.condition, temp_cond))
                condition_count = util.count_bits(rule.condition)
                # Strict ">" keeps the earliest rule among equally specific ones.
                if (compare_s == rule.pattern and
                        (rule.condition & temp_cond) == rule.condition and
                        condition_count > best_cond_count):
                    best_rule = rule
                    best_cond = temp_cond
                    best_cond_count = condition_count

        if best_rule is None:
            fmt = "No rule found for remaining string '%s'" % s
            logger.error(fmt)
            break
        else:
            result.append(best_rule.build_string(best_cond))
            if best_rule.seperates_words:
                cond = cond | rule_decorator.COND_BOW
            else:
                cond = cond & ~rule_decorator.COND_BOW

            # Drop the consumed text; from now on we are past the start.
            s = s[len(best_rule.pattern):]
            cond = cond & ~ rule_decorator.COND_BOS
            last_pattern_length = current_pattern_length

    return u"+".join(result)
def setup_logging():
    """Bind the module-level ``logger`` used by the parser functions.

    The logger is obtained from ``log.open_logging`` under the name
    'parser' with ``reload=True``.
    """
    global logger
    logger = log.open_logging("parser", reload=True)
def main():
    """Interactive entry point: read one sentence and print its parse.

    Sets up logging, prompts for a sentence on standard input, prints the
    result of ``parse_string`` for it and logs the start and end of the run.
    """
    global logger

    setup_logging()
    logger.info("Start parser")
    parsed = parse_string(input("Enter sentence:"))
    print(parsed)
    logger.info("End parser")
# Run the interactive parser only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()