HFST - Helsinki Finite-State Transducer Technology API  version 3.7.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
3831_transducer.h
1 // -*- mode: c++; -*-
2 // This program is free software: you can redistribute it and/or modify
3 // it under the terms of the GNU General Public License as published by
4 // the Free Software Foundation, version 3 of the License.
5 //
6 // This program is distributed in the hope that it will be useful,
7 // but WITHOUT ANY WARRANTY; without even the implied warranty of
8 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 // GNU General Public License for more details.
10 //
11 // You should have received a copy of the GNU General Public License
12 // along with this program. If not, see <http://www.gnu.org/licenses/>.
13 
14 #ifndef _HFST_OL_TRANSDUCER_TRANSDUCER_H_
15 #define _HFST_OL_TRANSDUCER_TRANSDUCER_H_
16 
17 #include <vector>
18 #include <set>
19 #include <iostream>
20 #include <limits>
21 #include <string>
22 #include <cstdlib>
23 #include <cstring>
24 #include <climits>
25 #include <utility>
26 #include <deque>
27 #include <queue>
28 #include <stdexcept>
29 
30 #include "../../HfstExceptionDefs.h"
31 #include "../../HfstFlagDiacritics.h"
32 #include "../../HfstSymbolDefs.h"
33 
35 namespace hfst_ol {
36 using hfst::FdOperation;
37 using hfst::FdState;
38 using hfst::FdTable;
39 
40 // using namespace hfst;
41 
42 
43 typedef unsigned short SymbolNumber;
44 typedef unsigned int TransitionTableIndex;
45 typedef unsigned int TransitionNumber;
46 typedef unsigned int StateIdNumber;
47 typedef short ValueNumber;
48 typedef float Weight;
49 typedef std::set<SymbolNumber> SymbolNumberSet;
50 typedef std::vector<SymbolNumber> SymbolNumberVector;
51 typedef std::set<TransitionTableIndex> TransitionTableIndexSet;
52 typedef std::vector<std::string> SymbolTable;
53 
54 // for lookup
55 typedef std::pair<Weight, std::vector<std::string> > HfstOneLevelPath;
56 typedef std::set<HfstOneLevelPath> HfstOneLevelPaths;
57 typedef std::vector<std::string> StringVector;
58 
59 // for ospell
60 typedef std::vector<short> FlagDiacriticState;
61 typedef std::map<SymbolNumber, hfst::FdOperation> OperationMap;
62 typedef std::map<std::string, SymbolNumber> StringSymbolMap;
63 class STransition;
64 
65 // for epsilon loop checking
66 struct TraversalState
67 {
68  TransitionTableIndex index;
69  FlagDiacriticState flags;
70  TraversalState(TransitionTableIndex i, FlagDiacriticState f):
71  index(i), flags(f) {}
72  bool operator==(TraversalState & rhs);
73 };
74 typedef std::vector<std::vector<TraversalState> > PositionStates;
75 
76 const SymbolNumber NO_SYMBOL_NUMBER = std::numeric_limits<SymbolNumber>::max();
77 const TransitionTableIndex NO_TABLE_INDEX =
78  std::numeric_limits<TransitionTableIndex>::max();
79 const Weight INFINITE_WEIGHT = static_cast<float>(NO_TABLE_INDEX);
80 
81 enum HeaderFlag {Weighted, Deterministic, Input_deterministic, Minimized,
82  Cyclic, Has_epsilon_epsilon_transitions,
83  Has_input_epsilon_transitions, Has_input_epsilon_cycles,
84  Has_unweighted_input_epsilon_cycles};
85 
86 // This is 2^31, hopefully equal to UINT_MAX/2 rounded up.
87 // For some profound reason it can't be replaced with (UINT_MAX+1)/2.
88 const TransitionTableIndex TRANSITION_TARGET_TABLE_START = 2147483648u;
89 const unsigned int MAX_IO_LEN = 10000;
90 const unsigned int MAX_RECURSION_DEPTH = 5000;
91 
92 // This function is queried to check whether we should do the
93 // single-character ascii lookup tokenization or the regular
94 // trie-tokenization
95 bool should_ascii_tokenize(unsigned char c);
96 
97 inline bool indexes_transition_table(const TransitionTableIndex i)
98 {
99  return i >= TRANSITION_TARGET_TABLE_START;
100 }
101 inline bool indexes_transition_index_table(const TransitionTableIndex i)
102 {
103  return i < TRANSITION_TARGET_TABLE_START;
104 }
105 
106 class TransducerHeader
107 {
108 private:
109  SymbolNumber number_of_input_symbols;
110  SymbolNumber number_of_symbols;
111  TransitionTableIndex size_of_transition_index_table;
112  TransitionTableIndex size_of_transition_target_table;
113 
114  StateIdNumber number_of_states;
115  TransitionNumber number_of_transitions;
116 
117  bool weighted;
118  bool deterministic;
119  bool input_deterministic;
120  bool minimized;
121  bool cyclic;
122  bool has_epsilon_epsilon_transitions;
123  bool has_input_epsilon_transitions;
124  bool has_input_epsilon_cycles;
125  bool has_unweighted_input_epsilon_cycles;
126 
127  static void header_error()
128  {
130  }
131 
132  template<class T>
133  static T read_property(std::istream& is)
134  {
135  T p;
136  is.read(reinterpret_cast<char*>(&p), sizeof(T));
137  return p;
138  }
139  template<class T>
140  static void write_property(T prop, std::ostream& os)
141  { os.write(reinterpret_cast<const char*>(&prop), sizeof(prop)); }
142  static bool read_bool_property(std::istream& is)
143  {
144  unsigned int prop;
145  is.read(reinterpret_cast<char*>(&prop), sizeof(unsigned int));
146  if(prop == 0)
147  return false;
148  if(prop == 1)
149  return true;
150  header_error();
151  return false;
152  }
153  static void write_bool_property(bool value, std::ostream& os)
154  {
155  unsigned int prop = (value?1:0);
156  os.write(reinterpret_cast<char*>(&prop), sizeof(prop));
157  }
158 public:
159  TransducerHeader(bool weights):
160  number_of_input_symbols(0),
161  number_of_symbols(1), // epsilon
162  size_of_transition_index_table(1),
163  size_of_transition_target_table(0),
164  number_of_states(1),
165  number_of_transitions(0),
166  weighted(weights),
167  deterministic(true),
168  input_deterministic(true),
169  minimized(true),
170  cyclic(false),
171  has_epsilon_epsilon_transitions(false),
172  has_input_epsilon_transitions(false),
173  has_input_epsilon_cycles(false),
174  has_unweighted_input_epsilon_cycles(false)
175  {}
176 
177  // a basic constructor that's only told information we
178  // actually use at the moment
179  TransducerHeader(
180  SymbolNumber input_symbols,
181  SymbolNumber symbols,
182  TransitionTableIndex transition_index_table,
183  TransitionTableIndex transition_table,
184  bool weights):
185  number_of_input_symbols(input_symbols),
186  number_of_symbols(symbols), // epsilon
187  size_of_transition_index_table(transition_index_table),
188  size_of_transition_target_table(transition_table),
189  number_of_states(0),
190  number_of_transitions(0),
191  weighted(weights),
192  deterministic(true),
193  input_deterministic(true),
194  minimized(true),
195  cyclic(false),
196  has_epsilon_epsilon_transitions(false),
197  has_input_epsilon_transitions(false),
198  has_input_epsilon_cycles(false),
199  has_unweighted_input_epsilon_cycles(false)
200  { }
201 
202 
203  TransducerHeader(std::istream& is):
204  number_of_input_symbols(read_property<SymbolNumber>(is)),
205  number_of_symbols(read_property<SymbolNumber>(is)),
206  size_of_transition_index_table(
207  read_property<TransitionTableIndex>(is)),
208  size_of_transition_target_table(
209  read_property<TransitionTableIndex>(is)),
210  number_of_states(read_property<StateIdNumber>(is)),
211  number_of_transitions(read_property<TransitionNumber>(is)),
212  weighted(read_bool_property(is)),
213  deterministic(read_bool_property(is)),
214  input_deterministic(read_bool_property(is)),
215  minimized(read_bool_property(is)),
216  cyclic(read_bool_property(is)),
217  has_epsilon_epsilon_transitions(read_bool_property(is)),
218  has_input_epsilon_transitions(read_bool_property(is)),
219  has_input_epsilon_cycles(read_bool_property(is)),
220  has_unweighted_input_epsilon_cycles(read_bool_property(is))
221  {
222  if(!is) {
224  }
225  }
226 
227  SymbolNumber symbol_count(void) const { return number_of_symbols; }
228  SymbolNumber input_symbol_count(void) const {
229  return number_of_input_symbols;
230  }
231  void increment_symbol_count(void)
232  {++number_of_symbols; ++number_of_input_symbols;}
233 
234  TransitionTableIndex index_table_size(void) const
235  { return size_of_transition_index_table; }
236  TransitionTableIndex target_table_size(void) const
237  { return size_of_transition_target_table; }
238 
239  bool probe_flag(HeaderFlag flag) const
240  {
241  switch (flag) {
242  case Weighted:
243  return weighted;
244  case Deterministic:
245  return deterministic;
246  case Input_deterministic:
247  return input_deterministic;
248  case Minimized:
249  return minimized;
250  case Cyclic:
251  return cyclic;
252  case Has_epsilon_epsilon_transitions:
253  return has_epsilon_epsilon_transitions;
254  case Has_input_epsilon_transitions:
255  return has_input_epsilon_transitions;
256  case Has_input_epsilon_cycles:
257  return has_input_epsilon_cycles;
258  case Has_unweighted_input_epsilon_cycles:
259  return has_unweighted_input_epsilon_cycles;
260  }
261  return false;
262  }
263 
264  void set_flag(HeaderFlag flag, bool value)
265  {
266  switch (flag) {
267  case Weighted:
268  weighted = true;
269  break;
270  case Deterministic:
271  deterministic = true;
272  break;
273  case Input_deterministic:
274  input_deterministic = true;
275  break;
276  case Minimized:
277  minimized = true;
278  break;
279  case Cyclic:
280  cyclic = true;
281  break;
282  case Has_epsilon_epsilon_transitions:
283  has_epsilon_epsilon_transitions = true;
284  break;
285  case Has_input_epsilon_transitions:
286  has_input_epsilon_transitions = true;
287  break;
288  case Has_input_epsilon_cycles:
289  has_input_epsilon_cycles = true;
290  break;
291  case Has_unweighted_input_epsilon_cycles:
292  has_unweighted_input_epsilon_cycles = true;
293  default:
294  return;
295  }
296  }
297 
298  void display() const
299  {
300  std::cout << "Transducer properties:" << std::endl
301  << " number_of_symbols: "
302  << number_of_symbols << std::endl
303  << " number_of_input_symbols: "
304  << number_of_input_symbols << std::endl
305  << " size_of_transition_index_table: "
306  << size_of_transition_index_table << std::endl
307  << " size_of_transition_target_table: "
308  << size_of_transition_target_table << std::endl
309  << " number_of_states: "
310  << number_of_states << std::endl
311  << " number_of_transitions: "
312  << number_of_transitions << std::endl
313  << " weighted: "
314  << weighted << std::endl
315  << " deterministic: "
316  << deterministic << std::endl
317  << " input_deterministic: "
318  << input_deterministic << std::endl
319  << " minimized: "
320  << minimized << std::endl
321  << " cyclic: "
322  << cyclic << std::endl
323  << " has_epsilon_epsilon_transitions: "
324  << has_epsilon_epsilon_transitions << std::endl
325  << " has_input_epsilon_transitions: "
326  << has_input_epsilon_transitions << std::endl
327  << " has_input_epsilon_cycles: "
328  << has_input_epsilon_cycles << std::endl
329  << " has_unweighted_input_epsilon_cycles: "
330  << has_unweighted_input_epsilon_cycles << std::endl;
331  }
332 
333  void write(std::ostream& os) const
334  {
335  write_property(number_of_input_symbols, os);
336  write_property(number_of_symbols, os);
337  write_property(size_of_transition_index_table, os);
338  write_property(size_of_transition_target_table, os);
339  write_property(number_of_states, os);
340  write_property(number_of_transitions, os);
341  write_bool_property(weighted, os);
342  write_bool_property(deterministic, os);
343  write_bool_property(input_deterministic, os);
344  write_bool_property(minimized, os);
345  write_bool_property(cyclic, os);
346  write_bool_property(has_epsilon_epsilon_transitions, os);
347  write_bool_property(has_input_epsilon_transitions, os);
348  write_bool_property(has_input_epsilon_cycles, os);
349  write_bool_property(has_unweighted_input_epsilon_cycles, os);
350  }
351 
352  friend class ConvertTransducerHeader;
353 };
354 
355 class TransducerAlphabet
356 {
357 protected:
358  SymbolTable symbol_table;
359  hfst::FdTable<SymbolNumber> fd_table;
360  SymbolNumber unknown_symbol;
361  SymbolNumber default_symbol;
362  SymbolNumber identity_symbol;
363 
364 
365 public:
366  TransducerAlphabet()
367  {
368  symbol_table.push_back("@_EPSILON_SYMBOL_@");
369  unknown_symbol = NO_SYMBOL_NUMBER;
370  default_symbol = NO_SYMBOL_NUMBER;
371  identity_symbol = NO_SYMBOL_NUMBER;
372  }
373  TransducerAlphabet(std::istream& is, SymbolNumber symbol_count);
374  TransducerAlphabet(const SymbolTable& st);
375 
376  void display() const;
377 
378  void write(std::ostream& os) const
379  {
380  for(SymbolTable::const_iterator i = symbol_table.begin();
381  i != symbol_table.end(); i++)
382  {
383  os << *i;
384  os.put('\0');
385  }
386  }
387 
388  bool has_flag_diacritics() const
389  { return fd_table.num_features() > 0; }
390  bool is_flag_diacritic(SymbolNumber symbol) const
391  { return fd_table.is_diacritic(symbol); }
392  bool is_like_epsilon(SymbolNumber symbol) const;
393 
394  const SymbolTable& get_symbol_table() const
395  { return symbol_table; }
396 
397  const std::string string_from_symbol(const SymbolNumber symbol) const
398  // represent epsilon as blank string
399  { return (symbol == 0) ? "" : symbol_table[symbol]; }
400 
401  SymbolNumber symbol_from_string(const std::string symbol_string) const;
402  StringSymbolMap build_string_symbol_map(void) const;
403  const hfst::FdTable<SymbolNumber>& get_fd_table() const
404  { return fd_table; }
405  const hfst::FdOperation * get_operation(SymbolNumber symbol) const
406  {
407  return fd_table.get_operation(symbol);
408  }
409  SymbolNumber get_unknown_symbol(void) const
410  { return unknown_symbol; }
411  SymbolNumber get_default_symbol(void) const
412  { return default_symbol; }
413  SymbolNumber get_identity_symbol(void) const
414  { return identity_symbol; }
415  void add_symbol(char * symbol);
416 
417 };
418 
419 class TransitionIndex
420 {
421 protected:
422  SymbolNumber input_symbol;
423  TransitionTableIndex first_transition_index;
424 public:
425  static const size_t size =
426  sizeof(SymbolNumber) + sizeof(TransitionTableIndex);
427  TransitionIndex(): input_symbol(NO_SYMBOL_NUMBER),
428  first_transition_index(NO_TABLE_INDEX) {}
429  TransitionIndex(SymbolNumber input,
430  TransitionTableIndex first_transition):
431  input_symbol(input), first_transition_index(first_transition) {}
432 
433  TransitionIndex(std::istream& is):
434  input_symbol(NO_SYMBOL_NUMBER), first_transition_index(0)
435  {
436  is.read(reinterpret_cast<char*>(&input_symbol),
437  sizeof(SymbolNumber));
438  is.read(reinterpret_cast<char*>(&first_transition_index),
439  sizeof(TransitionTableIndex));
440  }
441  // A constructor for reading from a char array at p
442  TransitionIndex(char * p):
443  input_symbol(*((SymbolNumber*) p)),
444  first_transition_index((*(TransitionTableIndex*)
445  (p + sizeof(SymbolNumber)))) {}
446  virtual ~TransitionIndex() {}
447 
448  void write(std::ostream& os, bool weighted) const
449  {
450  os.write(reinterpret_cast<const char*>(&input_symbol),
451  sizeof(input_symbol));
452  if(!weighted and input_symbol == NO_SYMBOL_NUMBER and
453  first_transition_index != NO_TABLE_INDEX) {
454  // Make sure that we write the correct type of final index
455  unsigned int unweighted_final_index = 1;
456  os.write(reinterpret_cast<const char*>(&unweighted_final_index),
457  sizeof(first_transition_index));
458  } else {
459  os.write(reinterpret_cast<const char*>(
460  &first_transition_index),
461  sizeof(first_transition_index));
462  }
463  }
464 
465  void display() const;
466 
467  TransitionTableIndex get_target(void) const
468  { return first_transition_index; }
469  SymbolNumber get_input_symbol(void) const
470  { return input_symbol; }
471 
472  bool matches(const SymbolNumber s) const;
473  virtual bool final(void) const;
474  virtual Weight final_weight(void) const { return 0.0; }
475 
476  static TransitionIndex create_final()
477  { return TransitionIndex(NO_SYMBOL_NUMBER, 1); }
478 };
479 
480 class TransitionWIndex : public TransitionIndex
481 {
482 public:
483  TransitionWIndex(): TransitionIndex() {}
484  TransitionWIndex(SymbolNumber input,
485  TransitionTableIndex first_transition):
486  TransitionIndex(input, first_transition) {}
487  TransitionWIndex(std::istream& is):
488  TransitionIndex(is) {}
489  TransitionWIndex(char * p):
490  TransitionIndex(p) {}
491 
492  Weight final_weight(void) const;
493 
494  static TransitionWIndex create_final()
495  { return TransitionWIndex(NO_SYMBOL_NUMBER, 0); }
496 
497  static TransitionWIndex create_final(Weight w)
498  {
499  union to_weight
500  {
501  TransitionTableIndex i;
502  Weight w;
503  } weight;
504  weight.w = w;
505  return TransitionWIndex(NO_SYMBOL_NUMBER, weight.i);
506  }
507 };
508 
509 class Transition
510 {
511 protected:
512  SymbolNumber input_symbol;
513  SymbolNumber output_symbol;
514  TransitionTableIndex target_index;
515 public:
516  static const size_t size = 2 * sizeof(SymbolNumber) +
517  sizeof(TransitionTableIndex);
518  Transition(SymbolNumber input, SymbolNumber output,
519  TransitionTableIndex target, Weight bogus=0.0f):
520  input_symbol(input), output_symbol(output), target_index(target)
521  {(void)bogus; bogus=0.0f;}
522  Transition(bool final, Weight bogus=0.0f):
523  input_symbol(NO_SYMBOL_NUMBER), output_symbol(NO_SYMBOL_NUMBER),
524  target_index(final?1:NO_TABLE_INDEX) {(void)bogus; bogus=0.0f;}
525  Transition(std::istream& is):
526  input_symbol(NO_SYMBOL_NUMBER), output_symbol(NO_SYMBOL_NUMBER),
527  target_index(0)
528  {
529  is.read(reinterpret_cast<char*>(&input_symbol),
530  sizeof(SymbolNumber));
531  is.read(reinterpret_cast<char*>(&output_symbol),
532  sizeof(SymbolNumber));
533  is.read(reinterpret_cast<char*>(&target_index),
534  sizeof(target_index));
535  }
536  // A constructor for reading from char array
537  Transition(char * p):
538  input_symbol(*(SymbolNumber*) p),
539  output_symbol(*(SymbolNumber*) (p + sizeof(SymbolNumber))),
540  target_index(*(TransitionTableIndex*) (p + 2 * sizeof(SymbolNumber)))
541  {}
542 
543  virtual ~Transition() {}
544 
545  virtual void write(std::ostream& os, bool weighted) const
546  {
547  os.write(reinterpret_cast<const char*>(&input_symbol),
548  sizeof(input_symbol));
549  os.write(reinterpret_cast<const char*>(&output_symbol),
550  sizeof(output_symbol));
551  os.write(reinterpret_cast<const char*>(&target_index),
552  sizeof(target_index));
553  if (weighted) {
554  os << 0.0f;
555  }
556  }
557 
558  virtual void display() const;
559 
560  TransitionTableIndex get_target(void) const {return target_index;}
561  SymbolNumber get_output_symbol(void) const {return output_symbol;}
562  SymbolNumber get_input_symbol(void) const {return input_symbol;}
563 
564  bool matches(const SymbolNumber s) const;
565  virtual bool final(void) const;
566  virtual Weight get_weight(void) const { return 0.0; }
567 };
568 
569 class TransitionW : public Transition
570 {
571 protected:
572  Weight transition_weight;
573 public:
574  static const size_t size = 2 * sizeof(SymbolNumber) +
575  sizeof(TransitionTableIndex) + sizeof(Weight);
576 
577  TransitionW(SymbolNumber input, SymbolNumber output,
578  TransitionTableIndex target, Weight w):
579  Transition(input, output, target), transition_weight(w) {}
580  TransitionW(bool final, Weight w):
581  Transition(final), transition_weight(w) {}
582  TransitionW(std::istream& is): Transition(is), transition_weight(0.0f)
583  {is.read(reinterpret_cast<char*>(&transition_weight), sizeof(Weight));}
584  TransitionW(char * p):
585  Transition(p),
586  transition_weight(*((Weight*)
587  (p + 2 * sizeof(SymbolNumber)
588  + sizeof(TransitionTableIndex))))
589  {}
590 
591  void write(std::ostream& os, bool weighted) const
592  {
593  Transition::write(os, false);
594  if (weighted) {
595  os.write(reinterpret_cast<const char*>(&transition_weight),
596  sizeof(transition_weight));
597  }
598  }
599 
600  void display() const;
601 
602  Weight get_weight(void) const { return transition_weight; }
603 };
604 
605 
606 template <class T>
607 class TransducerTable
608 {
609 protected:
610  std::vector<T> table;
611 public:
612  TransducerTable(): table() {}
613  TransducerTable(size_t size, const T& entry): table(size, entry) {}
614  TransducerTable(
615  std::istream& is, TransitionTableIndex index_count): table()
616  {
617  char * p = (char*) malloc(T::size * index_count);
618  is.read(p, T::size * index_count);
619  char * p_orig = p;
620  while(index_count) {
621  table.push_back(T(p));
622  --index_count;
623  p += T::size;
624  }
625  free(p_orig);
626  }
627  TransducerTable(const TransducerTable& t): table(t.table) {}
628 
629  void append(const T& v) {table.push_back(v);}
630  void set(size_t index, const T& v) {table[index] = v;}
631 
632  const T& operator[](TransitionTableIndex i) const
633  {
634  return (i < TRANSITION_TARGET_TABLE_START) ?
635  table[i] : table[i-TRANSITION_TARGET_TABLE_START];
636  }
637 
638  void display(bool transition_table) const
639  {
640  for(size_t i=0;i<table.size();i++)
641  {
642  std::cout << i;
643  if(transition_table)
644  std::cout << "/" << i+TRANSITION_TARGET_TABLE_START;
645  std::cout << ": ";
646  table[i].display();
647  }
648  }
649 
650  unsigned int size() const {return table.size();}
651 };
652 
653 class TransducerTablesInterface
654 {
655 public:
656  virtual ~TransducerTablesInterface() {}
657 
658  virtual const TransitionIndex& get_index(
659  TransitionTableIndex i) const = 0;
660  virtual const Transition& get_transition(
661  TransitionTableIndex i) const = 0;
662  virtual Weight get_weight(
663  TransitionTableIndex i) const = 0;
664  virtual SymbolNumber get_transition_input(
665  TransitionTableIndex i) const = 0;
666  virtual SymbolNumber get_transition_output(
667  TransitionTableIndex i) const = 0;
668  virtual TransitionTableIndex get_transition_target(
669  TransitionTableIndex i) const = 0;
670  virtual bool get_transition_finality(
671  TransitionTableIndex i) const = 0;
672  virtual SymbolNumber get_index_input(
673  TransitionTableIndex i) const = 0;
674  virtual TransitionTableIndex get_index_target(
675  TransitionTableIndex i) const = 0;
676  virtual bool get_index_finality(
677  TransitionTableIndex i) const = 0;
678  virtual Weight get_final_weight(
679  TransitionTableIndex i) const = 0;
680 
681  virtual void display() const {}
682 };
683 
684 template <class T1, class T2>
685 class TransducerTables : public TransducerTablesInterface
686 {
687 protected:
688  TransducerTable<T1> index_table;
689  TransducerTable<T2> transition_table;
690 public:
691  TransducerTables(std::istream& is, TransitionTableIndex index_table_size,
692  TransitionTableIndex transition_table_size):
693  index_table(
694  is, index_table_size),
695  transition_table(is, transition_table_size) { }
696 
697  TransducerTables(): index_table(1, T1::create_final()),
698  transition_table() {}
699  TransducerTables(const TransducerTable<T1>& index_table,
700  const TransducerTable<T2>& transition_table):
701  index_table(index_table), transition_table(transition_table) {}
702 
703  const TransitionIndex& get_index(TransitionTableIndex i) const
704  {return index_table[i];}
705  const Transition& get_transition(TransitionTableIndex i) const
706  {return transition_table[i];}
707  Weight get_weight(TransitionTableIndex i) const
708  { return transition_table[i].get_weight(); }
709  SymbolNumber get_transition_input(TransitionTableIndex i) const
710  { return transition_table[i].get_input_symbol(); }
711  SymbolNumber get_transition_output(TransitionTableIndex i) const
712  { return transition_table[i].get_output_symbol(); }
713  TransitionTableIndex get_transition_target(TransitionTableIndex i) const
714  { return transition_table[i].get_target(); }
715  bool get_transition_finality(TransitionTableIndex i) const
716  { return transition_table[i].final(); }
717  SymbolNumber get_index_input(TransitionTableIndex i) const
718  { return index_table[i].get_input_symbol(); }
719  TransitionTableIndex get_index_target(TransitionTableIndex i) const
720  { return index_table[i].get_target(); }
721  bool get_index_finality(TransitionTableIndex i) const
722  { return index_table[i].final(); }
723  Weight get_final_weight(TransitionTableIndex i) const
724  { return index_table[i].final_weight(); }
725 
726 
727  void display() const
728  {
729  std::cout << "Transition index table:" << std::endl;
730  index_table.display(false);
731  std::cout << "Transition table:" << std::endl;
732  transition_table.display(true);
733  }
734 };
735 
736 
737 // There follow some classes for implementing lookup
738 
739 class OlLetterTrie;
740 typedef std::vector<OlLetterTrie*> OlLetterTrieVector;
741 
742 class OlLetterTrie
743 {
744 private:
745  OlLetterTrieVector letters;
746  SymbolNumberVector symbols;
747 
748 public:
749  OlLetterTrie(void):
750  letters(UCHAR_MAX, static_cast<OlLetterTrie*>(NULL)),
751  symbols(UCHAR_MAX,NO_SYMBOL_NUMBER)
752  {}
753 
754  void add_string(const char * p,SymbolNumber symbol_key);
755 
756  SymbolNumber find_key(char ** p);
757 
758 };
759 
760 class Encoder {
761 
762 protected:
763  SymbolNumber number_of_input_symbols;
764  OlLetterTrie letters;
765  SymbolNumberVector ascii_symbols;
766 
767  void read_input_symbols(const SymbolTable & kt);
768  void read_input_symbol(const char * symbol, const int symbol_number);
769 
770 public:
771  Encoder(const SymbolTable & st, SymbolNumber input_symbol_count):
772  number_of_input_symbols(input_symbol_count),
773  ascii_symbols(UCHAR_MAX,NO_SYMBOL_NUMBER)
774  {
775  read_input_symbols(st);
776  }
777 
778  SymbolNumber find_key(char ** p);
779 
780  friend class Transducer;
781  friend class PmatchContainer;
782 };
783 
784 // A vector that can be written to at any position, so that it
785 // adds new elements if the desired element isn't already present.
786 class Tape: public SymbolNumberVector
787 {
788 public:
789  void write(unsigned int i, SymbolNumber s)
790  {
791  if (this->size() > i) {
792  this->operator[](i) = s;
793  } else {
794  while (this->size() <= i) {
795  this->push_back(NO_SYMBOL_NUMBER);
796  }
797  this->operator[](i) = s;
798  }
799  }
800 };
801 
805 {
806 protected:
807  TransducerHeader* header;
808  TransducerAlphabet* alphabet;
809 
810  TransducerTablesInterface* tables;
811 
812 
813  void load_tables(std::istream& is);
814 
815  // for lookup
816  Weight current_weight;
817  HfstOneLevelPaths * lookup_paths;
818  Encoder * encoder;
819  Tape input_tape;
820  Tape output_tape;
821  hfst::FdState<SymbolNumber> flag_state;
822  // This is to keep track of whether we're going to take a default transition
823  bool found_transition;
824 
825  ssize_t max_lookups;
826  unsigned int recursion_depth_left;
827 
828  void try_epsilon_transitions(unsigned int input_tape_pos,
829  unsigned int output_tape_pos,
830  TransitionTableIndex i);
831 
832  void try_epsilon_indices(unsigned int input_tape_pos,
833  unsigned int output_tape_pos,
834  TransitionTableIndex i);
835 
836  void find_transitions(SymbolNumber input,
837  unsigned int input_tape_pos,
838  unsigned int output_tape_pos,
839  TransitionTableIndex i);
840 
841  void find_index(SymbolNumber input,
842  unsigned int input_tape_pos,
843  unsigned int output_tape_pos,
844  TransitionTableIndex i);
845 
846  void get_analyses(unsigned int input_tape_pos,
847  unsigned int output_tape_pos,
848  TransitionTableIndex i);
849 
850 
851 public:
852  Transducer(std::istream& is);
853  Transducer(bool weighted);
854  Transducer(Transducer * t);
855  Transducer();
856  Transducer(const TransducerHeader& header,
857  const TransducerAlphabet& alphabet,
858  const TransducerTable<TransitionIndex>& index_table,
859  const TransducerTable<Transition>& transition_table);
860  Transducer(const TransducerHeader& header,
861  const TransducerAlphabet& alphabet,
862  const TransducerTable<TransitionWIndex>& index_table,
863  const TransducerTable<TransitionW>& transition_table);
864  virtual ~Transducer();
865 
866  void write(std::ostream& os) const;
867  Transducer * copy(Transducer * t, bool weighted = false);
868  void display() const;
869 
870  const TransducerHeader& get_header() const
871  { return *header; }
872  const TransducerAlphabet& get_alphabet() const
873  { return *alphabet; }
874  const Encoder& get_encoder(void) const
875  { return *encoder; }
876  const hfst::FdTable<SymbolNumber>& get_fd_table() const
877  { return alphabet->get_fd_table(); }
878  const SymbolTable& get_symbol_table() const
879  { return alphabet->get_symbol_table(); }
880 
881 
882  const TransitionIndex& get_index(TransitionTableIndex i) const
883  { return tables->get_index(i); }
884  const Transition& get_transition(TransitionTableIndex i) const
885  { return tables->get_transition(i); }
886  bool final_index(TransitionTableIndex i) const
887  {
888  if (indexes_transition_table(i)) {
889  return tables->get_transition_finality(i);
890  } else {
891  return tables->get_index_finality(i);
892  }
893  }
894  bool is_infinitely_ambiguous(void) const
895  {
896  return header->probe_flag(Has_input_epsilon_cycles);
897  }
898 
899  bool is_lookup_infinitely_ambiguous(const StringVector & s);
900  bool is_lookup_infinitely_ambiguous(const std::string & input);
901  void find_loop_epsilon_transitions(unsigned int input_pos,
902  TransitionTableIndex i,
903  PositionStates position_states);
904  void find_loop_epsilon_indices(unsigned int input_pos,
905  TransitionTableIndex i,
906  PositionStates position_states);
907  void find_loop_transitions(SymbolNumber input,
908  unsigned int input_pos,
909  TransitionTableIndex i,
910  PositionStates position_states);
911  void find_loop_index(SymbolNumber input,
912  unsigned int input_pos,
913  TransitionTableIndex i,
914  PositionStates position_states);
915  void find_loop(unsigned int input_pos,
916  TransitionTableIndex i,
917  PositionStates position_states);
918 
919  TransducerTable<TransitionWIndex> & copy_windex_table();
920  TransducerTable<TransitionW> & copy_transitionw_table();
921  TransducerTable<TransitionIndex> & copy_index_table();
922  TransducerTable<Transition> & copy_transition_table();
923 
924  // state_index must be an index to a state which is defined as either:
925  // (1) the start of a set of entries in the transition index table, or
926  // (2) the boundary before a set of entries in the transition table, in
927  // which case the following entries will all have the same input symbol
928  // This function will return a vector of indices to the transition table,
929  // i.e. the arcs from the given state
930  TransitionTableIndexSet get_transitions_from_state(
931  TransitionTableIndex state_index) const;
932 
933 
934  bool initialize_input(const char * input_str);
935  HfstOneLevelPaths * lookup_fd(const StringVector & s, ssize_t limit = -1);
936  /* Tokenize and lookup, accounting for flag diacritics, the surface string
937  \a s. The return value, a pointer to HfstOneLevelPaths
938  (which is a set) of analyses, is newly allocated.
939  */
940  HfstOneLevelPaths * lookup_fd(const std::string & s, ssize_t limit = -1);
941  HfstOneLevelPaths * lookup_fd(const char * s, ssize_t limit = -1);
942  void note_analysis(void);
943 
944  // Methods for supporting ospell
945  SymbolNumber get_unknown_symbol(void) const
946  { return alphabet->get_unknown_symbol(); }
947  StringSymbolMap get_string_symbol_map(void) const
948  { return alphabet->build_string_symbol_map(); }
949  STransition take_epsilons(const TransitionTableIndex i) const;
950  STransition take_epsilons_and_flags(const TransitionTableIndex i);
951  STransition take_non_epsilons(const TransitionTableIndex i,
952  const SymbolNumber symbol) const;
953  TransitionTableIndex next(const TransitionTableIndex i,
954  const SymbolNumber symbol) const;
955  TransitionTableIndex next_e(const TransitionTableIndex i) const;
956  bool has_transitions(const TransitionTableIndex i,
957  const SymbolNumber symbol) const;
958  bool has_epsilons_or_flags(const TransitionTableIndex i);
959  Weight final_weight(const TransitionTableIndex i) const;
960  bool is_flag(const SymbolNumber symbol)
961  { return alphabet->is_flag_diacritic(symbol); }
962  bool is_weighted(void)
963  { return header->probe_flag(Weighted);}
964 
965 
966  friend class ConvertTransducer;
967 };
968 
969 class STransition{
970 public:
971  TransitionTableIndex index;
972  SymbolNumber symbol;
973  Weight weight;
974 
975  STransition(TransitionTableIndex i,
976  SymbolNumber s):
977  index(i),
978  symbol(s),
979  weight(0.0)
980  {}
981 
982  STransition(TransitionTableIndex i,
983  SymbolNumber s,
984  Weight w):
985  index(i),
986  symbol(s),
987  weight(w)
988  {}
989 
990 };
991 
992 typedef std::pair<std::string, Weight> StringWeightPair;
993 
994 class StringWeightComparison
995 /* results are reversed by default because greater weights represent
996  worse results - to reverse the reversal, give a true argument*/
997 
998 {
999  bool reverse;
1000 public:
1001  StringWeightComparison(bool reverse_result=false):
1002  reverse(reverse_result)
1003  {}
1004 
1005  bool operator() (StringWeightPair lhs, StringWeightPair rhs)
1006  { // return true when we want rhs to appear before lhs
1007  if (reverse) {
1008  return (lhs.second < rhs.second);
1009  } else {
1010  return (lhs.second > rhs.second);
1011  }
1012  }
1013 };
1014 
1015 typedef std::priority_queue<StringWeightPair,
1016  std::vector<StringWeightPair>,
1017  StringWeightComparison> CorrectionQueue;
1018 typedef std::priority_queue<StringWeightPair,
1019  std::vector<StringWeightPair>,
1020  StringWeightComparison> AnalysisQueue;
1021 typedef std::priority_queue<StringWeightPair,
1022  std::vector<StringWeightPair>,
1023  StringWeightComparison> HyphenationQueue;
1024 
1025 /*class STransducer: public Transducer
1026  {
1027  protected:
1028  bool final_transition(TransitionTableIndex i)
1029  {
1030  return transitions[i]->final();
1031  }
1032 
1033  bool final_index(TransitionTableIndex i)
1034  {
1035  return indices[i]->final();
1036  }
1037 
1038 
1039  public:
1040  Transducer(FILE * f):
1041  header(TransducerHeader(f)),
1042  alphabet(TransducerAlphabet(f, header.symbol_count())),
1043  keys(alphabet.get_key_table()),
1044  index_reader(f,header.index_table_size()),
1045  transition_reader(f,header.target_table_size()),
1046  encoder(keys,header.input_symbol_count()),
1047  indices(index_reader()),
1048  transitions(transition_reader())
1049  {}
1050 
1051  TransitionIndexVector &indices;
1052 
1053  TransitionVector &transitions;
1054 
1055  SymbolNumber find_next_key(char ** p)
1056  {
1057  return encoder.find_key(p);
1058  }
1059 
1060 
1061  unsigned int get_state_size(void)
1062  {
1063  return alphabet.get_state_size();
1064  }
1065 
1066  std::vector<const char*> * get_symbol_table(void)
1067  {
1068  return &symbol_table;
1069  }
1070 
1071  SymbolNumber get_other(void)
1072  {
1073  return alphabet.get_other();
1074  }
1075 
1076  TransducerAlphabet * get_alphabet(void)
1077  {
1078  return &alphabet;
1079  }
1080 
1081  OperationMap * get_operations(void)
1082  {
1083  return alphabet.get_operation_map();
1084  }
1085 
1086  STransition take_epsilons(const TransitionTableIndex i) const;
1087  STransition take_epsilons_and_flags(const TransitionTableIndex i);
1088  STransition take_non_epsilons(const TransitionTableIndex i,
1089  const SymbolNumber symbol) const;
1090  TransitionTableIndex next(const TransitionTableIndex i,
1091  const SymbolNumber symbol) const;
1092  TransitionTableIndex next_e(const TransitionTableIndex i) const;
1093  bool has_transitions(const TransitionTableIndex i,
1094  const SymbolNumber symbol) const;
1095  bool has_epsilons_or_flags(const TransitionTableIndex i);
1096  bool is_final(const TransitionTableIndex i);
1097  Weight final_weight(const TransitionTableIndex i) const;
1098  bool is_flag(const SymbolNumber symbol)
1099  { return alphabet.is_flag(symbol); }
1100  bool is_weighted(void)
1101  { return header.probe_flag(Weighted);}
1102 
1103  };*/
1104 
1105 class TreeNode
1106 {
1107 public:
1108  SymbolNumberVector string;
1109  unsigned int input_state;
1110  TransitionTableIndex mutator_state;
1111  TransitionTableIndex lexicon_state;
1112  hfst::FdState<SymbolNumber> flag_state;
1113  Weight weight;
1114 
1115  TreeNode(SymbolNumberVector prev_string,
1116  unsigned int i,
1117  TransitionTableIndex mutator,
1118  TransitionTableIndex lexicon,
1119  hfst::FdState<SymbolNumber> state,
1120  Weight w):
1121  string(prev_string),
1122  input_state(i),
1123  mutator_state(mutator),
1124  lexicon_state(lexicon),
1125  flag_state(state),
1126  weight(w)
1127  { }
1128 
1129  TreeNode(hfst::FdState<SymbolNumber> start_state): // starting state node
1130  string(SymbolNumberVector()),
1131  input_state(0),
1132  mutator_state(0),
1133  lexicon_state(0),
1134  flag_state(start_state),
1135  weight(0.0)
1136  { }
1137 
1138  TreeNode update_lexicon(SymbolNumber next_symbol,
1139  TransitionTableIndex next_lexicon,
1140  Weight weight);
1141 
1142  TreeNode update_mutator(SymbolNumber next_symbol,
1143  TransitionTableIndex next_mutator,
1144  Weight weight);
1145 
1146  void increment_mutator(void);
1147 
1148  TreeNode update(SymbolNumber next_symbol,
1149  unsigned int next_input,
1150  TransitionTableIndex next_mutator,
1151  TransitionTableIndex next_lexicon,
1152  Weight weight);
1153 
1154  TreeNode update(SymbolNumber next_symbol,
1155  TransitionTableIndex next_mutator,
1156  TransitionTableIndex next_lexicon,
1157  Weight weight);
1158 
1159 
1160 };
1161 
1162 typedef std::deque<TreeNode> TreeNodeQueue;
1163 
1164 int nByte_utf8(unsigned char c);
1165 
1166 class InputString
1167 {
1168 
1169 private:
1170  SymbolNumberVector s;
1171 
1172 public:
1173  InputString():
1174  s(SymbolNumberVector())
1175  { }
1176 
1177  bool initialize(const Encoder & encoder, char * input, SymbolNumber other);
1178 
1179  unsigned int len(void)
1180  {
1181  return s.size();
1182  }
1183 
1184  SymbolNumber operator[](unsigned int i)
1185  {
1186  return s[i];
1187  }
1188 
1189 };
1190 
1191 class AlphabetTranslationException: public std::runtime_error
1192 { // "what" should hold the first untranslatable symbol
1193 public:
1194 
1195  AlphabetTranslationException(const std::string what):
1196  std::runtime_error(what)
1197  { }
1198 };
1199 
1203 class Speller
1204 {
1205 public:
1206  Transducer * mutator;
1207  Transducer * lexicon;
1208  InputString input;
1209  TreeNodeQueue queue;
1210  SymbolNumberVector alphabet_translator;
1211 // hfst::FdTable<SymbolNumber> operations;
1212  std::vector<std::string> symbol_table;
1213 
1214  Speller(Transducer * mutator_ptr, Transducer * lexicon_ptr):
1215  mutator(mutator_ptr),
1216  lexicon(lexicon_ptr),
1217  input(InputString()),
1218  queue(TreeNodeQueue()),
1219  alphabet_translator(SymbolNumberVector()),
1220 // operations(lexicon->get_fd_table()),
1221  symbol_table(lexicon->get_symbol_table())
1222  {
1223  build_alphabet_translator();
1224  }
1225 
1226  bool init_input(char * str, const Encoder & encoder, SymbolNumber other);
1227 
1228  void build_alphabet_translator(void);
1229  void lexicon_epsilons(void);
1230  void mutator_epsilons(void);
1231  void consume_input(void);
1232  void lexicon_consume(void);
1235  bool check(char * line);
1238  CorrectionQueue correct(char * line);
1239  std::string stringify(SymbolNumberVector symbol_vector);
1240 };
1241 
1242 }
1243 
1244 #endif
A compiled transducer format, suitable for fast lookup operations.
Definition: 3831_transducer.h:804
#define HFST_THROW(E)
Macro to throw an exception of type E. Use THROW instead of regular throw with subclasses of HfstExce...
Definition: HfstExceptionDefs.h:28
A spellchecker, constructed from two optimized-lookup transducer instances. An alphabet translator is...
Definition: 3831_transducer.h:1203
std::set< HfstOneLevelPath > HfstOneLevelPaths
A set of simple paths.
Definition: HfstDataTypes.h:101
std::pair< float, StringVector > HfstOneLevelPath
A path of one level of arcs with collected weight.
Definition: HfstDataTypes.h:97
CorrectionQueue correct(char *line)
Definition: ospell.cc:282
bool check(char *line)
Definition: ospell.cc:326
Transducer has wrong type.
Definition: HfstExceptionDefs.h:388