13 #ifndef _HFST_TOKENIZER_H_
14 #define _HFST_TOKENIZER_H_
27 using hfst::StringSet;
// Forward declaration: lets the typedef below name MultiCharSymbolTrie*
// before the class itself is declared.
36 class MultiCharSymbolTrie;
// Vector of (raw) pointers to trie nodes; used as the type of
// MultiCharSymbolTrie::symbol_rests.
37 typedef std::vector<MultiCharSymbolTrie*> MultiCharSymbolTrieVector;
// Vector of boolean flags; used as the type of MultiCharSymbolTrie::is_leaf.
38 typedef std::vector<bool> SymbolEndVector;
// Trie over C strings. Each node holds a vector of pointers to further
// MultiCharSymbolTrie nodes (symbol_rests) and a vector of boolean flags
// (is_leaf). Only declarations are visible here; the member definitions are
// elsewhere.
// NOTE(review): this listing skips source lines 41-42 and 51-52 (the embedded
// numbering jumps 40 -> 43 and 50 -> 53), so the opening brace and any access
// specifiers are not shown — confirm which members are private/public.
40 class MultiCharSymbolTrie
// Pointers to other trie nodes (std::vector<MultiCharSymbolTrie*>).
// Presumably one slot per possible next byte — TODO confirm in the .cc file.
43 MultiCharSymbolTrieVector symbol_rests;
// Boolean flags (std::vector<bool>); presumably marks the positions at which
// a stored symbol ends — TODO confirm.
44 SymbolEndVector is_leaf;
// Const query on the C string p. Name suggests it tests whether p points at
// the terminating NUL — confirm against the implementation.
45 bool is_end_of_string(
const char * p)
const ;
// Mutating helper taking p; presumably records in is_leaf that a symbol ends
// here — confirm.
46 void set_symbol_end(
const char * p);
// Mutating helper taking p; presumably prepares symbol_rests before a child
// node is added — confirm.
47 void init_symbol_rests(
const char * p);
// Mutating helper taking p; presumably stores the remainder of a symbol in a
// child node — confirm.
48 void add_symbol_rest(
const char * p);
// Const query on p; presumably the read-side counterpart of set_symbol_end.
49 bool is_symbol_end(
const char * p)
const;
// Const accessor returning a pointer to a trie node selected by p.
// NOTE(review): whether the result can be null is not visible here.
50 MultiCharSymbolTrie * get_symbol_rest_trie(
const char * p)
const;
// Default constructor (declared with a C-style `void` parameter list).
53 MultiCharSymbolTrie(
void);
// Destructor. NOTE(review): symbol_rests holds raw pointers; presumably the
// destructor releases the child nodes — confirm ownership in the .cc file.
54 ~MultiCharSymbolTrie(
void);
// Adds the C string p to the trie.
55 void add(
const char * p);
// Const lookup starting at p; returns a const char *.
// NOTE(review): the meaning of the returned pointer (e.g. end of the matched
// symbol vs. null on no match) is not visible in this header — confirm.
56 const char * find(
const char * p)
const;
// NOTE(review): these declarations sit at source lines 88-91; the enclosing
// class header is not part of this listing (presumably HfstTokenizer, which
// the tooltips place at HfstTokenizer.h:85 — confirm).
// Trie of multicharacter symbols; presumably filled by add_multichar_symbol.
88 MultiCharSymbolTrie multi_char_symbols;
// Set of strings; presumably the symbols registered through add_skip_symbol.
89 StringSet skip_symbol_set;
// Const helper taking a C string and returning an int; by its name, the size
// of the next symbol starting at `symbol` — units (bytes vs. characters) are
// not visible here, confirm in the implementation.
90 int get_next_symbol_size(
const char * symbol)
const;
// Const predicate on s; presumably a membership test against skip_symbol_set.
// NOTE(review): s is taken by non-const reference although the method is
// const — check whether the implementation modifies s.
91 bool is_skip_symbol(
String &s)
const;
129 const std::string &output_string)
const;
std::pair< String, String > StringPair
A symbol pair in a transition.
Definition: HfstSymbolDefs.h:71
StringPairVector tokenize(const std::string &input_string) const
Tokenize the string input_string.
std::string String
A UTF-8 symbol in a transition.
Definition: HfstSymbolDefs.h:60
std::vector< std::pair< std::string, std::string > > StringPairVector
A vector of string pairs.
Definition: HfstDataTypes.h:106
HfstTokenizer()
Create a tokenizer that recognizes utf-8 symbols.
Definition: HfstTokenizer.cc:83
void add_multichar_symbol(const std::string &symbol)
Add the multicharacter symbol `symbol` to this tokenizer.
Definition: HfstTokenizer.cc:113
void add_skip_symbol(const std::string &symbol)
Add a symbol to be skipped to this tokenizer.
Definition: HfstTokenizer.cc:117
StringVector tokenize_one_level(const std::string &input_string) const
Tokenize the string input_string.
Definition: HfstTokenizer.cc:142
std::pair< float, StringVector > HfstOneLevelPath
A path of one level of arcs with collected weight.
Definition: HfstDataTypes.h:97
static void check_utf8_correctness(const std::string &input_string)
If input_string is not valid UTF-8, throw an IncorrectUtf8CodingException.
Definition: HfstTokenizer.cc:237
A tokenizer for creating transducers from UTF-8 strings.
Definition: HfstTokenizer.h:85
Typedefs and functions for symbols, symbol pairs and sets of symbols.
std::vector< std::string > StringVector
A vector of strings.
Definition: HfstDataTypes.h:88