forked from andybalholm/redwood
-
Notifications
You must be signed in to change notification settings - Fork 0
/
phrase.go
155 lines (138 loc) · 3.94 KB
/
phrase.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
package main
// content phrase matching, using the Aho-Corasick algorithm
// A phraseNode is a node in the trie for scanning for phrases.
type phraseNode struct {
match string // the phrase matched at this point, if any
children []int32 // array indexes to continue searching at (128 or 256 elements, one for each possible next byte)
// onlyChild is used instead of children if it would contain only one
// nonzero entry.
onlyChild int32
onlyChildByte byte
// fallback is the index of the phraseNode that has the longest suffix in common with this node.
// It is set by findFallbackNodes.
fallback int32
}
// child returns the index of the node to transition to if the next byte is b.
func (n *phraseNode) child(b byte) int32 {
if int(b) < len(n.children) {
return n.children[b]
}
if b == n.onlyChildByte {
return n.onlyChild
}
return 0
}
// setChild sets the index to transition to when the byte read is b.
func (n *phraseNode) setChild(b byte, next int32) {
if int(b) < len(n.children) {
n.children[b] = next
return
}
if n.children != nil {
// children has only 128 elements, and b is a high-bit byte.
newChildren := make([]int32, 256)
copy(newChildren, n.children)
n.children = newChildren
n.children[b] = next
return
}
if n.onlyChild == 0 {
n.onlyChild = next
n.onlyChildByte = b
return
}
if b < 128 && n.onlyChildByte < 128 {
n.children = make([]int32, 128)
} else {
n.children = make([]int32, 256)
}
n.children[n.onlyChildByte] = n.onlyChild
n.children[b] = next
}
// A phraseList is a trie of phrase rules (with fallback pointers for
// Aho-Corasick).
// It acts as a state table for the state machine that scans the page content.
// It is a byte-by-byte trie in UTF-8 encoding.
type phraseList []phraseNode
func newPhraseList() phraseList {
p := make([]phraseNode, 1, 500)
return p
}
// addPhrase adds a phrase to p.
func (p *phraseList) addPhrase(s string) {
n := int32(0) // index into phraseTrie
for i := 0; i < len(s); i++ {
c := s[i]
node := &(*p)[n]
next := node.child(c)
if next == 0 {
next = int32(len(*p))
node.setChild(c, next)
*p = append(*p, phraseNode{})
}
n = next
}
(*p)[n].match = s
}
// findFallbackNodes traverses the phrase trie and sets each node's fallback
// pointer. node is the node to use as the root of the search,
// and text is the bytes that would take the scanner there from the root of the
// trie. For the root node, node == 0 and text == nil.
func (p *phraseList) findFallbackNodes(node int32, text []byte) {
v := &(*p)[node]
// Find this node's fallback node.
for i := 1; i < len(text); i++ {
f := int32(0) // If there is no suffix in common, use the root.
for j := i; j < len(text); j++ {
f = (*p)[f].child(text[j])
if f == 0 {
break
}
}
if f != 0 {
v.fallback = f
break
}
}
// Traverse this node's children.
switch {
case v.children != nil:
for c, n := range v.children {
if n != 0 {
p.findFallbackNodes(n, append(text, byte(c)))
}
}
case v.onlyChild != 0:
p.findFallbackNodes(v.onlyChild, append(text, v.onlyChildByte))
}
}
// A phraseScanner scans input one byte at a time
// and calls callback with the matching phrases.
type phraseScanner struct {
list phraseList
currentNode int32 // the current node in the phraseList
callback func(string)
}
func newPhraseScanner(list phraseList, callback func(string)) *phraseScanner {
return &phraseScanner{
list: list,
callback: callback,
}
}
// scanByte updates ps for one byte of input.
func (ps *phraseScanner) scanByte(c byte) {
// Find the new current node.
currentNode := ps.currentNode
newState := ps.list[currentNode].child(c)
for currentNode != 0 && newState == 0 {
currentNode = ps.list[currentNode].fallback
newState = ps.list[currentNode].child(c)
}
ps.currentNode = newState
// See if any phrases have been matched.
for n := newState; n != 0; n = ps.list[n].fallback {
if m := ps.list[n].match; m != "" {
ps.callback(m)
}
}
}