presage 0.9.1
text2ngram.cpp
Go to the documentation of this file.
1
2/******************************************************
3 * Presage, an extensible predictive text entry system
4 * ---------------------------------------------------
5 *
6 * Copyright (C) 2008 Matteo Vescovi <matteo.vescovi@yahoo.co.uk>
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License along
19 with this program; if not, write to the Free Software Foundation, Inc.,
20 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 **********(*)*/
23
24
25#include "config.h"
26
27#include <iostream>
28#include <fstream>
29#include <vector>
30#include <list>
31#include <string>
32#include <map>
33
34#ifdef HAVE_UNISTD_H
35# include <unistd.h>
36#endif
37
38#ifdef HAVE_STDLIB_H
39# include <stdlib.h>
40#endif
41
42#include <getopt.h>
43#include <assert.h>
44
46#include "core/iso8859_1.h"
47#include "core/progress.h"
48
49#include "../lib/predictors/dbconnector/sqliteDatabaseConnector.h"
50
// Name this tool reports in its usage and version output.
const std::string PROGRAM_NAME("text2ngram");

// An n-gram is kept as the ordered list of its tokens; it is used as the
// key of the n-gram count map built in main().
typedef std::list<std::string> NgramList;

// Both helpers are defined below main().
void version();
void usage();
57
58int main(int argc, char* argv[])
59{
60 int next_option;
61
62 // Setup some defaults
63 // - default to generating 1-gram counts
64 int ngrams = 1;
65
66 // - default output to stdout (empty string signifies stdout)
67 std::string output;
68
69 const std::string TABBED_SEPARATED_VALUES = "tsv";
70 const std::string SQLITE = "sqlite";
71 // - default format is tabbed separated values
72 std::string format = TABBED_SEPARATED_VALUES;
73
74 // - default to case sensitive
75 bool lowercase = false;
76
77 // - default to no append
78 bool append = false;
79
80
81 // getopt structures
82 const char * const short_options = "n:o:f:alhv";
83 const struct option long_options[] =
84 {
85 { "ngrams", required_argument, 0, 'n' },
86 { "output", required_argument, 0, 'o' },
87 { "format", required_argument, 0, 'f' },
88 { "append", no_argument, 0, 'a' },
89 { "lowercase", no_argument, 0, 'l' },
90 { "help", no_argument, 0, 'h' },
91 { "version", no_argument, 0, 'v' },
92 { 0, 0, 0, 0 }
93 };
94
95 do {
96 next_option = getopt_long(argc,
97 argv,
98 short_options,
99 long_options,
100 NULL);
101
102 switch (next_option) {
103 case 'n': // --ngrams or -n option
104 if (atoi(optarg) > 0) {
105 ngrams = atoi(optarg);
106 } else {
107 usage();
108 }
109 break;
110 case 'o': // --output or -o option
111 output = optarg;
112 break;
113 case 'f': // --format or -f option
114 if (optarg == SQLITE
115 || optarg == TABBED_SEPARATED_VALUES) {
116 format = optarg;
117 } else {
118 std::cerr << "Unknown format " << optarg << std::endl << std::endl;
119 usage();
120 return -1;
121 }
122 break;
123 case 'a': // --append or -a option
124 // append mode
125 append = true;
126 break;
127 case 'l': // --lowercase or -l option
128 lowercase = true;
129 break;
130 case 'h': // --help or -h option
131 usage();
132 exit (0);
133 break;
134 case 'v': // --version or -v option
135 version();
136 exit (0);
137 break;
138 case '?': // unknown option
139 usage();
140 exit (0);
141 break;
142 case -1:
143 break;
144 default:
145 std::cerr << "Error: unhandled option." << std::endl;
146 exit(0);
147 }
148
149 } while (next_option != -1);
150
151
152 if ((argc - optind < 1)) {
153 usage();
154 return -1;
155 }
156
157
158 // ngramMap stores <token,count> pairs
159 std::map<NgramList, int> ngramMap;
160
161 for (int i = optind; i < argc; i++) {
162 // do the actual processing file by file
163 std::string token;
164 NgramList ngram;
165
166 // points to output file
167 // print out file information
168 std::cout << "Parsing " << argv[i] << "..."
169 << std::endl;
170
171 ProgressBar<char> progressBar;
172
173 // create tokenizer object and open input file stream
174 std::ifstream infile(argv[i]);
175 ForwardTokenizer tokenizer(infile,
176 " \f\n\r\t\v",
177 "`~!@#$%^&*()_-+=\\|]}[{'\";:/?.>,<");
178 tokenizer.lowercaseMode(lowercase);
179
180 // take care of first N-1 tokens
181 for (int i = 0; (i < ngrams - 1 && tokenizer.hasMoreTokens()); i++) {
182 ngram.push_back(tokenizer.nextToken());
183 }
184
185 while (tokenizer.hasMoreTokens()) {
186 // extract token from input stream
187 token = tokenizer.nextToken();
188
189 // update ngram with new token
190 ngram.push_back(token);
191
192 // update map with new token occurrence
193 ngramMap[ngram] = ngramMap[ngram] + 1;
194
195 // update progress bar
196 //progressBar(tokenizer.progress());
197 progressBar.update(tokenizer.progress());
198
199 // remove front token from ngram
200 ngram.pop_front();
201 }
202
203 infile.close();
204 }
205
206
207 std::cout << "Writing out to " << format << " format file "
208 << output << "..." << std::endl;
209 if (format == TABBED_SEPARATED_VALUES) {
210 // output to tabbed separated values text file
211 //
212
213 std::ofstream *outstream = 0;
214 std::ostream *prev_outstream = 0;
215
216 if (output.c_str()) {
217 // tie outstream to file
218 outstream = new std::ofstream (output.c_str(), std::ios::out);
219 assert(outstream);
220 prev_outstream = std::cout.tie (outstream);
221 }
222
223 // write results to output stream
224 ProgressBar<char> progressBar;
225 long total = ngramMap.size();
226 long count = 0;
227 std::map<NgramList, int>::const_iterator it;
228 for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
229 for (NgramList::const_iterator ngram_it = it->first.begin();
230 ngram_it != it->first.end();
231 ngram_it++) {
232 std::cout << *ngram_it << '\t';
233 }
234 std::cout << it->second << std::endl;
235 progressBar.update(static_cast<double>(count++)/total);
236 }
237
238 if (output.c_str()) {
239 std::cout.tie (prev_outstream);
240 outstream->close ();
241 delete outstream;
242 }
243
244 } else if (format == SQLITE) {
245 // output to SQLITE
246 //
247
248 SqliteDatabaseConnector sqliteDbCntr(output, ngrams, true);
249 sqliteDbCntr.beginTransaction();
250 sqliteDbCntr.createNgramTable(ngrams);
251
252 // write results to output stream
253 ProgressBar<char> progressBar;
254 long total = ngramMap.size();
255 long count = 0;
256 std::map<NgramList, int>::const_iterator it;
257 for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
258
259 // convert from NgramList to Ngram
260 Ngram ngram;
261 for (NgramList::const_iterator jt = it->first.begin();
262 jt != it->first.end();
263 jt++) {
264 ngram.push_back(*jt);
265 }
266
267 if (append) {
268 // need to check whether ngram is already in database.
269 // when appending to existing database
270 int count = sqliteDbCntr.getNgramCount(ngram);
271 if (count > 0) {
272 // ngram already in database, update count
273 sqliteDbCntr.updateNgram(ngram, count + it->second);
274 } else {
275 // ngram not in database, insert it
276 sqliteDbCntr.insertNgram(ngram, it->second);
277 }
278 } else {
279 // insert ngram
280 sqliteDbCntr.insertNgram(ngram, it->second);
281 }
282
283 progressBar.update(static_cast<double>(count++)/total);
284 }
285 sqliteDbCntr.endTransaction();
286 } else {
287 abort();
288 }
289
290
291 std::cout << std::endl;
292
293 return 0;
294}
295
296
298{
299 std::cout
300 << PROGRAM_NAME << " (" << PACKAGE << ") version " << VERSION << std::endl
301 << "Copyright (C) Matteo Vescovi" << std::endl
302 << "This is free software; see the source for copying conditions. There is NO" << std::endl
303 << "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." << std::endl
304 << std::endl;
305}
306
307
308void usage()
309{
310 std::cout
311 << "Usage: " << PROGRAM_NAME << " [OPTION]... infiles..." << std::endl
312 << std::endl
313 << " --output, -o O " << "Output file name O" << std::endl
314 << " --ngrams, -n N " << "Specify ngram cardinality N" << std::endl
315 << " --format, -f F " << "Output file format F: sqlite, tsv (tabbed separated values)" << std::endl
316 << " --lowercase, -l " << "Enable lowercase conversion mode" << std::endl
317 << " --append, -a " << "Open output file in append mode" << std::endl
318 << " --help, -h " << "Display this information" << std::endl
319 << " --version, -v " << "Show version information" << std::endl
320 << std::endl
321 << PROGRAM_NAME << " is free software distributed under the GPL." << std::endl
322 << "Send bug reports to " << PACKAGE_BUGREPORT << std::endl
323 << "Copyright (C) Matteo Vescovi" << std::endl;
324}
virtual void endTransaction() const
void createNgramTable(const size_t cardinality) const
virtual void beginTransaction() const
void insertNgram(const Ngram ngram, const int count) const
int getNgramCount(const Ngram ngram) const
void updateNgram(const Ngram ngram, const int count) const
virtual bool hasMoreTokens() const
virtual std::string nextToken()
virtual double progress() const
Definition: ngram.h:33
void update(const double percentage)
Definition: progress.h:54
void lowercaseMode(const bool)
Definition: tokenizer.cpp:81
const Logger< _charT, _Traits > & endl(const Logger< _charT, _Traits > &lgr)
Definition: logger.h:278
int main(int argc, char *argv[])
Definition: text2ngram.cpp:58
std::list< std::string > NgramList
Definition: text2ngram.cpp:53
void usage()
Definition: text2ngram.cpp:308
const std::string PROGRAM_NAME
Definition: text2ngram.cpp:51
void version()
Definition: text2ngram.cpp:297