-
Notifications
You must be signed in to change notification settings - Fork 0
/
Toker.cpp
98 lines (88 loc) · 1.85 KB
/
Toker.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#include "Toker.h"
#include <string>
#include <fstream>
#include <streambuf>
#include <boost/foreach.hpp>
#include <boost/algorithm/string.hpp>
// Global document id, initialised to 1. Nothing in the visible part of this
// file reads or modifies it; presumably other translation units use it as a
// running counter -- TODO confirm.
unsigned int global_doc_id=1;
/**
 * Builds a Toker that takes its stop words from the given file.
 *
 * The path is copied into stopFile and the stop-word list is loaded
 * immediately through initializeStopWords(). When loading fails, a message
 * is written to std::cout and isValid is cleared; the flag can be queried
 * with getIsValid().
 *
 * @param filename Path of the stop-word file.
 */
Toker::Toker(std::string& filename)
    : stopFile(filename),
      isValid(true)
{
    const bool loaded = initializeStopWords();
    if (loaded)
    {
        return;
    }

    // Loading failed: report it and mark this instance as unusable.
    std::cout << "Could not get stop words - check if file exists";
    isValid = false;
}
/** Destructor; its body performs no explicit cleanup. */
Toker::~Toker() {}
bool
Toker::getIsValid()
{
return isValid;
}
/**
 * Loads the stop-word list from the file named by stopFile.
 *
 * Every non-empty line of the file is appended to stops as one stop word.
 * Lines are read into a std::string, so a line of any length is accepted:
 * reading into a fixed-size character buffer makes the stream fail on an
 * over-long line, which would end the loop and silently drop all remaining
 * words. A trailing '\r' (left behind by CRLF line endings) is removed so
 * that such words can still compare equal to tokens.
 *
 * @return false if the file could not be opened (an error is printed),
 *         true otherwise -- including when the file contains no words.
 */
bool
Toker::initializeStopWords()
{
    std::ifstream stopwords(stopFile.c_str());
    if (!stopwords)
    {
        printf("Error:couldn't get stop words\n");
        return false;
    }

    std::string word;
    while (std::getline(stopwords, word))
    {
        // Strip the carriage return of a CRLF-terminated line.
        if (!word.empty() && word[word.size() - 1] == '\r')
        {
            word.erase(word.size() - 1);
        }

        // Blank lines carry no stop word.
        if (word.empty())
        {
            continue;
        }

        stops.push_back(word);
    }
    return true;
}
/**
 * Reads a whole file, tokenizes it and passes the tokens to removeStops().
 *
 * @param filename Path of the file to tokenize.
 * @param tokens   Token list forwarded to removeStops().
 * @param atMap    Attribute map forwarded to removeStops().
 * @return false if the file could not be opened (a message is written to
 *         std::cout), true otherwise. The result of removeStops() is not
 *         taken into account.
 */
bool
Toker::tokenize(std::string& filename,LinkedList& tokens,AttrMap& atMap)
{
    std::ifstream input(filename.c_str());
    if (!input)
    {
        std::cout<<"Could not open file for reading\n";
        return false;
    }

    // Slurp the complete stream into memory.
    std::string contents((std::istreambuf_iterator<char>(input)),
                         std::istreambuf_iterator<char>());

    // Going through c_str() keeps only the text up to the first NUL byte.
    std::string tokenstring(contents.c_str());

    Tokenizer tok(tokenstring);
    removeStops(tok,tokens,atMap);
    return true;
}
/**
 * Records every token of tok that is not a stop word.
 *
 * Each token is lower-cased and skipped when it occurs in stops. A token
 * already present in tokens gets increaseCount() called on its atMap entry;
 * an unseen token is added to tokens and receives a newly allocated
 * TermAttributes in atMap.
 *
 * New tokens are inserted directly after the element that was first in
 * tokens when the function was entered. If tokens is empty, the first new
 * token becomes that element, because an empty list offers no valid
 * position to insert after.
 *
 * @param tok    Source of the raw tokens.
 * @param tokens List of distinct tokens; extended with unseen tokens.
 * @param atMap  Per-token attributes keyed by the lower-cased token.
 * @return Always true.
 */
bool
Toker::removeStops(Tokenizer& tok,LinkedList& tokens,AttrMap& atMap)
{
    // Insertion anchor. Equals end() while the list is empty, so it must not
    // be handed to insert_after() until the list has at least one element.
    LinkedList::const_iterator anchor = tokens.begin();

    for (Tokenizer::iterator iter = tok.begin(); iter != tok.end(); ++iter)
    {
        std::string term(*iter);
        boost::algorithm::to_lower(term);

        // Stop words are dropped entirely.
        if (std::find(stops.begin(), stops.end(), term) != stops.end())
        {
            continue;
        }

        // Known token: only bump its counter.
        if (std::find(tokens.begin(), tokens.end(), term) != tokens.end())
        {
            // NOTE(review): assumes every element of tokens has a non-null
            // entry in atMap -- confirm callers keep the two in sync.
            TermAttributes* ta = atMap[term];
            ta->increaseCount();
            continue;
        }

        // Unseen token: add it to the list and create its attributes.
        if (tokens.empty())
        {
            tokens.push_front(term);
            anchor = tokens.begin();
        }
        else
        {
            tokens.insert_after(anchor, term);
        }

        // NOTE(review): allocated with new and not released here; ownership
        // presumably lies with whoever owns atMap -- confirm.
        atMap[term] = new TermAttributes();
    }

    // The function is declared to return bool; flowing off the end without
    // a return statement would be undefined behaviour.
    return true;
}