HFST - Helsinki Finite-State Transducer Technology API  version 3.7.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
HfstTokenizer.h
Go to the documentation of this file.
1 // This program is free software: you can redistribute it and/or modify
2 // it under the terms of the GNU General Public License as published by
3 // the Free Software Foundation, version 3 of the License.
4 //
5 // This program is distributed in the hope that it will be useful,
6 // but WITHOUT ANY WARRANTY; without even the implied warranty of
7 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
8 // GNU General Public License for more details.
9 //
10 // You should have received a copy of the GNU General Public License
11 // along with this program. If not, see <http://www.gnu.org/licenses/>.
12 
13 #ifndef _HFST_TOKENIZER_H_
14 #define _HFST_TOKENIZER_H_
15 #include "HfstSymbolDefs.h"
16 #include "HfstExceptionDefs.h"
17 #include <iostream>
18 #include <climits>
19 #include <string>
20 
24 namespace hfst
25 {
// Symbol-related names used by the tokenizer. These using-declarations name
// members of hfst from inside namespace hfst itself. Per the definition index
// of this listing, String and StringPair are defined in HfstSymbolDefs.h
// (StringSet presumably as well -- TODO confirm).
26  using hfst::String;
27  using hfst::StringSet;
28  using hfst::StringPair;
30 
31  // Copied from HfstDataTypes.h because including the file
32  // causes problems with header file #ifndef _HEADER_FILE_H_ guards
// StringVector: a vector of strings.
33  typedef std::vector<std::string> StringVector;
// HfstOneLevelPath: a path of one level of arcs with collected weight
// (the float is the first member of the pair, the symbols the second).
34  typedef std::pair<float,StringVector> HfstOneLevelPath;
35 
// Forward declaration so the pointer-vector typedef below can name the class
// before its definition.
36  class MultiCharSymbolTrie;
// Type of MultiCharSymbolTrie::symbol_rests (raw, non-smart pointers).
37  typedef std::vector<MultiCharSymbolTrie*> MultiCharSymbolTrieVector;
// Type of MultiCharSymbolTrie::is_leaf.
38  typedef std::vector<bool> SymbolEndVector;
39 
// Recursive node type used by HfstTokenizer (which holds one instance as
// multi_char_symbols). Each node stores a vector of pointers to further
// MultiCharSymbolTrie nodes plus a vector of boolean flags. Only declarations
// are visible here; the member definitions are elsewhere (presumably
// HfstTokenizer.cc -- TODO confirm).
// NOTE(review): a destructor is declared and the class holds raw pointers,
// but no copy constructor / copy assignment is declared. If the destructor
// deletes the symbol_rests entries, copying an instance would be unsafe --
// confirm against the implementation.
40  class MultiCharSymbolTrie
41  {
42  private:
// Pointers to child nodes. NOTE(review): indexing scheme and ownership are not
// visible in this header -- presumably one slot per byte value; confirm.
43  MultiCharSymbolTrieVector symbol_rests;
// Boolean flags; presumably marks where a stored symbol ends -- confirm.
44  SymbolEndVector is_leaf;
// Private helpers; each takes a pointer p into a C string. Their exact
// semantics are defined in the implementation file, not in this listing.
45  bool is_end_of_string(const char * p) const ;
46  void set_symbol_end(const char * p);
47  void init_symbol_rests(const char * p);
48  void add_symbol_rest(const char * p);
49  bool is_symbol_end(const char * p) const;
50  MultiCharSymbolTrie * get_symbol_rest_trie(const char * p) const;
51 
52  public:
53  MultiCharSymbolTrie(void);
54  ~MultiCharSymbolTrie(void);
// Presumably inserts the symbol starting at p into this structure -- confirm.
55  void add(const char * p);
// Looks up p without modifying the node (const). NOTE(review): the meaning of
// the returned pointer (and whether it can be null) is not visible here.
56  const char * find(const char * p) const;
57  };
58 
// Body of class HfstTokenizer: "A tokenizer for creating transducers from
// UTF-8 strings." The class header (HfstTokenizer.h line 85 according to the
// definition index of this listing) and the original doc comments are not
// present in this listing; the descriptions below are taken from that index.
86  {
87  private:
// Multicharacter symbol storage; presumably filled by add_multichar_symbol().
88  MultiCharSymbolTrie multi_char_symbols;
// Presumably the symbols registered through add_skip_symbol() -- confirm.
89  StringSet skip_symbol_set;
// NOTE(review): unit of the returned size (presumably bytes) is not visible here.
90  int get_next_symbol_size(const char * symbol) const;
// NOTE(review): takes a non-const String& although the method itself is const;
// whether s is modified cannot be determined from this header.
91  bool is_skip_symbol(String &s) const;
92 
93  public:
94 
// Create a tokenizer that recognizes UTF-8 symbols.
96  HfstTokenizer();
97 
// Add a symbol to be skipped to this tokenizer.
104  void add_skip_symbol(const std::string &symbol);
105 
// Add the multicharacter symbol `symbol` to this tokenizer.
112  void add_multichar_symbol(const std::string& symbol);
113 
// Tokenize the string input_string; the result is a vector of string pairs.
115  StringPairVector tokenize(const std::string &input_string) const;
116 
// Tokenize the string input_string; the result is a vector of strings.
118  StringVector tokenize_one_level(const std::string &input_string) const;
119 
// Static; undocumented in this listing. Presumably splits str at spaces and
// returns the pieces as string pairs -- TODO confirm.
120  static StringPairVector tokenize_space_separated(const std::string & str);
121 
// Two-string overload; undocumented in this listing. Presumably pairs the
// tokens of input_string with the tokens of output_string -- TODO confirm.
128  StringPairVector tokenize(const std::string &input_string,
129  const std::string &output_string) const;
130 
// If input_string is not valid UTF-8, throw an IncorrectUtf8CodingException.
148  static void check_utf8_correctness(const std::string &input_string);
149  };
150 }
151 #endif
std::pair< String, String > StringPair
A symbol pair in a transition.
Definition: HfstSymbolDefs.h:71
StringPairVector tokenize(const std::string &input_string) const
Tokenize the string input_string.
std::string String
A UTF-8 symbol in a transition.
Definition: HfstSymbolDefs.h:60
std::vector< std::pair< std::string, std::string > > StringPairVector
A vector of string pairs.
Definition: HfstDataTypes.h:106
A file for exceptions.
HfstTokenizer()
Create a tokenizer that recognizes UTF-8 symbols.
Definition: HfstTokenizer.cc:83
void add_multichar_symbol(const std::string &symbol)
Add the multicharacter symbol `symbol` to this tokenizer.
Definition: HfstTokenizer.cc:113
void add_skip_symbol(const std::string &symbol)
Add a symbol to be skipped to this tokenizer.
Definition: HfstTokenizer.cc:117
StringVector tokenize_one_level(const std::string &input_string) const
Tokenize the string input_string.
Definition: HfstTokenizer.cc:142
std::pair< float, StringVector > HfstOneLevelPath
A path of one level of arcs with collected weight.
Definition: HfstDataTypes.h:97
static void check_utf8_correctness(const std::string &input_string)
If input_string is not valid UTF-8, throw an IncorrectUtf8CodingException.
Definition: HfstTokenizer.cc:237
A tokenizer for creating transducers from UTF-8 strings.
Definition: HfstTokenizer.h:85
Typedefs and functions for symbols, symbol pairs and sets of symbols.
std::vector< std::string > StringVector
A vector of strings.
Definition: HfstDataTypes.h:88