HFST - Helsinki Finite-State Transducer Technology API  version 3.7.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
convert.h
1 // This program is free software: you can redistribute it and/or modify
2 // it under the terms of the GNU General Public License as published by
3 // the Free Software Foundation, version 3 of the License.
4 //
5 // This program is distributed in the hope that it will be useful,
6 // but WITHOUT ANY WARRANTY; without even the implied warranty of
7 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
8 // GNU General Public License for more details.
9 //
10 // You should have received a copy of the GNU General Public License
11 // along with this program. If not, see <http://www.gnu.org/licenses/>.
12 
13 #ifndef _HFST_OL_CONVERT_H_
14 #define _HFST_OL_CONVERT_H_
15 
16 #if HAVE_OPENFST
17 #include "fst/fstlib.h"
18 #endif // HAVE_OPENFST
19 
20 #include "transducer.h"
21 #include "pmatch.h"
22 
23 namespace hfst_ol {
24 
25  typedef std::map<hfst_ol::TransitionTableIndex,unsigned int>
26  HfstOlToBasicStateMap;
27 
28 struct TransitionPlaceholder {
29  unsigned int target;
30  SymbolNumber output;
31  float weight;
32 
33 TransitionPlaceholder(unsigned int t, SymbolNumber o, float w):
34  target(t),
35  output(o),
36  weight(w)
37  {}
38 };
39 
40 typedef std::map<SymbolNumber, std::vector<TransitionPlaceholder> >
41  SymbolTransitionsMap;
42 
43 
44 struct StatePlaceholder {
45  unsigned int state_number;
46  unsigned int start_index;
47  unsigned int first_transition;
48  SymbolTransitionsMap inputs;
49  bool final;
50  float final_weight;
51  StatePlaceholder (unsigned int state, bool finality, unsigned int first,
52  Weight final_weight):
53  state_number(state),
54  start_index(UINT_MAX),
55  first_transition(first),
56  final(finality),
57  final_weight(final_weight)
58  {}
59  StatePlaceholder ():
60  state_number(UINT_MAX),
61  start_index(UINT_MAX),
62  first_transition(UINT_MAX),
63  final(false),
64  final_weight(0.0)
65  { }
66 
67  bool is_simple(std::set<SymbolNumber> const & flag_symbols) const
68  {
69  if (state_number == 0) {
70  return false;
71  }
72  if (flag_symbols.size() == 0) {
73  return inputs.size() < 2;
74  }
75  bool have_zero = false;
76  SymbolNumber input_symbols = 0;
77  for(SymbolTransitionsMap::const_iterator it = inputs.begin();
78  it != inputs.end(); ++it) {
79  if ((it->first == 0) or (flag_symbols.count(it->first) != 0)) {
80  if (!have_zero) {
81  have_zero = true;
82  ++input_symbols;
83  }
84  } else {
85  ++input_symbols;
86  }
87  if (input_symbols > 1) {
88  return false;
89  }
90  }
91  return true;
92  }
93 
94  unsigned int number_of_transitions(void) const {
95  unsigned int count = 0;
96  for(SymbolTransitionsMap::const_iterator it = inputs.begin();
97  it != inputs.end(); ++it) {
98  count += it->second.size();
99  }
100  return count;
101  }
102 
103  unsigned int symbol_offset(
104  SymbolNumber const symbol,
105  std::set<SymbolNumber> const & flag_symbols) const {
106  if (symbol == 0) {
107  return 0;
108  }
109  unsigned int offset = 0;
110  if (flag_symbols.size() == 0) {
111  for(SymbolTransitionsMap::const_iterator it = inputs.begin();
112  it!= inputs.end(); ++it) {
113  if (symbol == it->first) {
114  return offset;
115  }
116  offset += it->second.size();
117  }
118 
119  } else {
120  if (inputs.count(0) != 0) { // if there are epsilons
121  offset = inputs.find(0)->second.size();
122  }
123  for(std::set<SymbolNumber>::iterator flag_it = flag_symbols.begin();
124  flag_it != flag_symbols.end(); ++flag_it) {
125  if (inputs.count(*flag_it) != 0) { // if this flag is present
126  if (symbol == *flag_it) {
127  // Flags go to 0 (even if there's no epsilon)
128  return 0;
129  }
130  offset += inputs.find(*flag_it)->second.size();
131  }
132  }
133  for(SymbolTransitionsMap::const_iterator it = inputs.begin();
134  it!= inputs.end(); ++it) {
135  if (it->first == 0 || flag_symbols.count(it->first) != 0) {
136  continue;
137  }
138  if (symbol == it->first) {
139  return offset;
140  }
141  offset += it->second.size();
142  }
143  std::string message("error in conversion between optimized lookup "
144  "format and HfstTransducer;\ntried to calculate "
145  "symbol_offset for symbol not present in state");
148  message);
149  }
150  std::string message("error in function StatePlaceholder::symbol_offset");
152  }
153 };
154 
155 bool compare_states_by_input_size(
156  const StatePlaceholder & lhs, const StatePlaceholder & rhs);
157 bool compare_states_by_state_number(
158  const StatePlaceholder & lhs, const StatePlaceholder & rhs);
159 
160 class IndexPlaceholders: public std::map<unsigned int,
161  std::pair<unsigned int, SymbolNumber> >
162 {
163 public:
164  bool fits(StatePlaceholder const & state,
165  std::set<SymbolNumber> const & flag_symbols,
166  unsigned int const position) const
167  {
168  if (count(position) != 0) {
169  return false;
170  }
171  for (SymbolTransitionsMap::const_iterator it = state.inputs.begin();
172  it != state.inputs.end(); ++it) {
173  SymbolNumber index_offset = it->first;
174  if (flag_symbols.count(index_offset) != 0) {
175  index_offset = 0;
176  }
177  if (count(index_offset + position + 1) != 0) {
178  return false;
179  }
180  }
181  return true;
182  }
183 
184  bool unsuitable(unsigned int const index,
185  SymbolNumber const symbols,
186  float const packing_aggression) const
187  {
188  if (count(index) != 0) {
189  return true;
190  }
191 
192  // "Perfect packing" (under this strategy)
193 /* for (unsigned int i = 0; i < symbols; ++i) {
194 
195  if (count(index + i) == 0) {
196  return true;
197  }
198  return false;
199  }*/
200 
201  unsigned int filled = 0;
202  for (unsigned int i = 0; i < symbols; ++i) {
203  filled += count(index + i + 1);
204  if (filled >= (packing_aggression*symbols)) {
205  return true; // too full
206  }
207  }
208  return false;
209  }
210 };
211 
212 
213 void write_transitions_from_state_placeholders(
214  TransducerTable<TransitionW> & transition_table,
215  std::vector<hfst_ol::StatePlaceholder>
216  & state_placeholders,
217  std::set<SymbolNumber> & flag_symbols);
218 
219 void add_transitions_with(SymbolNumber symbol,
220  std::vector<TransitionPlaceholder> & transitions,
221  TransducerTable<TransitionW> & transition_table,
222  std::vector<hfst_ol::StatePlaceholder>
223  & state_placeholders,
224  std::set<SymbolNumber> & flag_symbols);
225 
226 #if HAVE_OPENFST // Covers remainder of file
227 typedef fst::StdArc::StateId StateId;
228 typedef fst::StdArc StdArc;
229 typedef fst::StdVectorFst TransduceR;
230 typedef fst::ArcIterator<TransduceR> ArcIterator;
231 typedef std::set<StateId> StateIdSet;
232 typedef std::set<int64> OfstSymbolSet;
233 
234 const StateIdNumber NO_ID_NUMBER = UINT_MAX;
235 const StateId NO_STATE_ID = UINT_MAX;
236 const SymbolNumber BIG_STATE_LIMIT = 1;
237 
238 
239 struct transition_label
240 {
241  int64 input_symbol;
242  int64 output_symbol;
243 };
244 
245 struct compare_transition_labels
246 {
247  bool operator() ( const transition_label &l1,
248  const transition_label &l2) const
249  {
250  if (l1.input_symbol == l2.input_symbol)
251  return l1.output_symbol < l2.output_symbol;
252  return l1.input_symbol < l2.input_symbol;
253  }
254 };
255 
256 typedef std::set<transition_label,compare_transition_labels> LabelSet;
257 
258 enum place_holder {EMPTY, EMPTY_START, OCCUPIED_START, OCCUPIED };
259 typedef std::vector<place_holder> PlaceHolderVector;
260 
261 /*
262  A class which can translate between StateId and StateIdNumbers
263 */
264 class ConvertIdNumberMap
265 {
266 private:
267  typedef std::map<StateIdNumber,StateId> IdNumbersToStateIds;
268  typedef std::map<StateId,StateIdNumber> StateIdsToIdNumbers;
269 
270  IdNumbersToStateIds id_to_node;
271  StateIdsToIdNumbers node_to_id;
272 
273  StateIdNumber node_counter;
274 
275  void add_node(StateId n, TransduceR *tr);
276  /*
277  Assign every node n in t a unique id number i. Assign the start node
278  t->root_node() id number 0. Set node_to_id[n] = i and
279  id_to_node[i] = n.
280  */
281  void set_node_maps(TransduceR * t);
282 
283 public:
284  ConvertIdNumberMap(TransduceR * t):
285  node_counter(0)
286  { set_node_maps(t); }
287 
288  StateIdNumber get_number_of_nodes(void) const
289  { return node_counter; }
290 
291  StateIdNumber get_node_id(StateId n) const;
292  StateId get_id_node(StateIdNumber i) const;
293 };
294 
295 typedef std::map<int64,unsigned int> OfstSymbolCountMap;
296 typedef std::set<std::string> SymbolSet;
297 
298 class ConvertTransducerAlphabet
299 {
300  private:
301  SymbolTable symbol_table;
302 
303  TransduceR* transducer;
304  fst::SymbolTable * ofst_symbol_table;
305  // input and output symbol tables together
306 
307  std::map<int64, SymbolNumber> input_symbols_map;
308  std::map<int64, SymbolNumber> output_symbols_map;
309 
310  void inspect_node(StateId n, StateIdSet& visited_nodes,
311  OfstSymbolCountMap& symbol_count_map, SymbolSet& all_symbol_set);
312  void get_symbol_info(
313  OfstSymbolCountMap &symbol_count_map, SymbolSet& all_symbol_set);
314  void populate_symbol_table(
315  OfstSymbolCountMap &input_symbol_counts, SymbolSet& all_symbol_set);
316  void set_maps();
317  public:
318  ConvertTransducerAlphabet(TransduceR* t);
319 
320  void display() const;
321 
322  const SymbolTable& get_symbol_table() const {return symbol_table;}
323  SymbolNumber lookup_ofst_input_symbol(int64 s) const;
324  SymbolNumber lookup_ofst_output_symbol(int64 s) const;
325  bool is_flag_diacritic(SymbolNumber symbol) const;
326 
327  TransducerAlphabet to_alphabet() const;
328 };
329 
330 class ConvertTransition
331 {
332 private:
333  SymbolNumber input_symbol;
334  SymbolNumber output_symbol;
335  // initially we only know the StateIdNumber of the target, but once the
336  // tables have been laid out we can just store the state's table index
337  union
338  {
339  StateIdNumber target_state_id;
340  TransitionTableIndex target_state_index;
341  };
342  Weight weight;
343 
344  TransitionTableIndex table_index; // location in the transition table
345 public:
346  /*
347  Set the symbol numbers and set the indices of the states according
348  to ConvertIdNumberMap nodes_to_id_numbers.
349  */
350  ConvertTransition(const StdArc &a);
351 
352  void display() const;
353 
354  SymbolNumber get_input_symbol(void) const
355  { return input_symbol; }
356 
357  void set_target_state_index();
358 
359  void set_table_index(TransitionTableIndex i)
360  { table_index = i; }
361 
362  TransitionTableIndex get_table_index(void) const
363  { return table_index; }
364 
365  template<class T> T to_transition() const;
366 
367  bool numerical_cmp( const ConvertTransition &another_transition) const;
368  bool operator<(const ConvertTransition &another_index) const;
369 };
370 
371 class ConvertTransitionIndex
372 {
373 private:
374  SymbolNumber input_symbol;
375  // initially we only have the corresponding transition object, but once the
376  // tables have been laid out we can just store the transition's table index
377  union
378  {
379  ConvertTransition* first_transition;
380  TransitionTableIndex first_transition_index;
381  };
382 public:
383  ConvertTransitionIndex(SymbolNumber input, ConvertTransition* transition):
384  input_symbol(input), first_transition(transition) {}
385 
386  void display() const;
387 
388  SymbolNumber get_input_symbol(void) const
389  { return input_symbol; }
390 
391  ConvertTransition* get_first_transition() const
392  { return first_transition; }
393 
394  void set_first_transition_index(TransitionTableIndex i)
395  { first_transition_index = i; }
396 
397  template<class T> T to_transition_index() const;
398 
399  bool operator<(const ConvertTransitionIndex &another_index) const;
400 };
401 
402 struct ConvertTransitionCompare
403 {
404  bool operator() (const ConvertTransition * t1,
405  const ConvertTransition * t2) const
406  {
407  return t1->operator<(*t2);
408  }
409 };
410 
411 struct ConvertTransitionIndexCompare
412 {
413  bool operator() (const ConvertTransitionIndex * i1,
414  const ConvertTransitionIndex * i2) const
415  {
416  return i1->operator<(*i2);
417  }
418 };
419 
420 typedef std::set<ConvertTransition*,ConvertTransitionCompare>
421  ConvertTransitionSet;
422 typedef std::set<ConvertTransitionIndex*,ConvertTransitionIndexCompare>
423  ConvertTransitionIndexSet;
424 
425 class ConvertFstState
426 {
427 private:
428  ConvertTransitionSet transitions;
429  ConvertTransitionIndexSet transition_indices;
430 
431  // the location in the transition table of the state's first transition. For
432  // states without transitions, this is where the first transition *would* be
433  TransitionTableIndex first_transition_index;
434  // the location in the table tables where the state's transition indices
435  // begin the state's location in the index tables. This can be either in the
436  // transition index table, or, if the state only has transitions with
437  // one input symbol, in the transition table, immediately preceding
438  // the first transition
439  TransitionTableIndex table_index;
440 
441  bool final;
442  Weight weight;
443 
444  StateIdNumber id;
445 
446  void set_transitions(StateId n, TransduceR * tr);
447  void set_transition_indices(void);
448 
449  friend class fst_state_compare;
450 public:
451  ConvertFstState(StateId n, TransduceR * tr);
452  ~ConvertFstState();
453 
454  void display() const;
455 
456  TransitionTableIndex set_transition_table_indices(
457  TransitionTableIndex place);
458 
459  TransitionTableIndex get_first_transition_index() const
460  { return first_transition_index; }
461 
462  void set_table_index(TransitionTableIndex i)
463  { table_index = i; }
464  TransitionTableIndex get_table_index(void) const
465  { return table_index; }
466 
467  SymbolNumberSet * get_input_symbols(void) const;
468 
469  SymbolNumber number_of_input_symbols(void) const
470  { return transition_indices.size(); }
471  SymbolNumber number_of_transitions(void) const
472  { return transitions.size(); }
473  bool is_final(void) const {return final;}
474  bool is_big_state(void) const
475  {
476  return (transition_indices.size() > BIG_STATE_LIMIT);
477  }
478  bool is_start_state(void) const {return id == 0;}
479  StateIdNumber get_id(void) const {return id;}
480 
481  // update transitions with the state's location in the tables
482  void set_transition_target_indices();
483 
484  // add this state's transition indices into the given transition index table
485  template<class T>
486  void insert_transition_indices(TransducerTable<T>& index_table) const;
487 
488  template<class T>
489  TransitionTableIndex append_transitions(TransducerTable<T>& transition_table,
490  TransitionTableIndex place) const;
491 };
492 
493  typedef std::vector<ConvertFstState*> ConvertFstStateVector;
494 
495 class ConvertTransitionTableIndices
496 {
497 private:
498  PlaceHolderVector indices;
499  PlaceHolderVector::size_type lower_bound;
500  unsigned int lower_bound_test_count;
501  SymbolNumber number_of_input_symbols;
502 
503  bool state_fits(SymbolNumberSet * input_symbols,
504  bool final_state,
505  PlaceHolderVector::size_type index);
506 
507  void insert_state(SymbolNumberSet * input_symbols,
508  bool final_state,
509  PlaceHolderVector::size_type index);
510  void get_more_space(void);
511 public:
512  ConvertTransitionTableIndices(SymbolNumber input_symbol_count):
513  lower_bound(0), lower_bound_test_count(0),
514  number_of_input_symbols(input_symbol_count)
515  {
516  get_more_space();
517  };
518 
519  PlaceHolderVector::size_type add_state(ConvertFstState * state);
520  PlaceHolderVector::size_type size(void) const
521  { return indices.size(); }
522 
523  PlaceHolderVector::size_type last_full_index(void) const;
524 };
525 
526 class ConvertTransducerHeader
527 {
528  private:
529  static void full_traversal(TransducerHeader& h, TransduceR* tr, StateId n,
530  StateIdSet& visited_nodes, StateIdSet& nodes_in_path,
531  OfstSymbolSet& all_input_symbols);
532  static void find_input_epsilon_cycles(StateId n, StateId t,
533  StateIdSet &epsilon_targets,
534  bool unweighted_only, TransduceR * tr,
535  TransducerHeader& h);
536  public:
537  static void compute_header(TransducerHeader& header,
538  TransduceR * t, SymbolNumber symbol_count,
539  TransitionTableIndex number_of_index_table_entries,
540  TransitionTableIndex number_of_target_table_entries,
541  bool weighted);
542 };
543 
544 class ConvertTransducer
545 {
546 private:
547  TransduceR * fst;
548  ConvertIdNumberMap * id_number_map;
549  ConvertTransitionTableIndices * fst_indices;
550  PlaceHolderVector::size_type index_table_size;
551 
552  TransducerHeader header;
553  ConvertTransducerAlphabet alphabet;
554  ConvertFstStateVector states;
555 
556  void read_nodes(void);
557  void set_transition_table_indices(void);
558  void set_index_table_indices(void);
559 
560  void add_input_symbols(StateId n,
561  SymbolNumberSet &input_symbols,
562  StateIdSet &visited_nodes);
563  SymbolNumber number_of_input_symbols(void);
564 
565  TransitionTableIndex count_transitions(void) const;
566 
567  void display_states() const;
568  void display_tables() const;
569 
570  template<class T> TransducerTable<T> make_index_table(
571  TransitionTableIndex index_table_size) const;
572  template<class T> TransducerTable<T> make_transition_table() const;
573 public:
574  ConvertTransducer(TransduceR * tr, bool weighted);
575  ~ConvertTransducer();
576 
577  ConvertIdNumberMap& get_id_number_map() {return *id_number_map;}
578  ConvertTransducerAlphabet& get_alphabet() {return alphabet;}
579  ConvertFstState& get_state(StateIdNumber s) {return *states[s];}
580  bool is_weighted() const {return header.probe_flag(Weighted);}
581 
582  Transducer* to_transducer() const;
583 
584  // exposed to this module during the constructor
585  static ConvertTransducer* constructing_transducer;
586 };
587 
588 #endif // HAVE_OPENFST
589 
590 }
591 
592 #endif // include guard
An error happened probably due to a bug in the HFST code.
Definition: HfstExceptionDefs.h:378
#define HFST_THROW_MESSAGE(E, M)
Macro to throw an exception of type E with message M. Use THROW instead of regular throw with subclas...
Definition: HfstExceptionDefs.h:35