/*  __    
   (_ _|_ _ ._ _ ._ 
   __) |_(_)| | ||_)
                 |  
   The part-of-speech tagger Stomp, created by Jonas Sjobergh, jsh@nada.kth.se
   This version is from 2003-09-17.

   ****************************************************************************
   Stomp

   Copyright (C) 2003  Jonas Sjobergh
   
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation; either version 2
   of the License, or (at your option) any later version.
   
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
   
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

   ****************************************************************************

   I am aware that there is a lot of ugly code here, but I don't really feel
   like doing anything about that now. If you find any errors or inefficiencies
   in the code, please feel free to let me know, and I will most likely update
   the code. This tagger has been described in a paper (though not the 
   implementation details):
  @inproceedings{stomp,
   author = {Jonas Sj\"{o}bergh},
   title = {Stomp, a {POS}-tagger with a different view},
   booktitle = "Proceedings of RANLP-2003",
   address = "Borovets, Bulgaria",
   year = "2003"
  }
  available here:
  http://www.nada.kth.se/~jsh/publications/stomp03.ps

  In order to do anything useful you need some training data for the tagger. It 
  is intended to do part-of-speech tagging, but any other mark-up task where 
  context gives a useful clue should be possible (shallow parsing would be one
  example). The training data should be formatted with one word (or whatever you want 
  to annotate) and one tag (or whatever...) on each line, separated by a tab.
  Stomp also needs a list of all tags used in the training data (though it is
  trivial to change the code to infer this from the training data), with one
  tag on each line. 

  You run the tagger like this:
  stomp <directory with training data> <file to annotate>

  In the directory there should be two files, one named 'corpus' containing the 
  training data and one named taglex, with the list of tags. The file to annotate 
  should have white space separated tokens (i.e. "Hello (again)." should be
  written like "Hello ( again ) ." or similarly). A tiny example corpus is included. 

  I used gcc 3.3 for Solaris when compiling Stomp. Any standard compliant C++ compiler
  should be ok, as long as you change the typedefs for the maps (using the standard
  map works, but is very slow). Any gcc after 3.0 should work, with the hash_map
  provided with the compiler.

  Some things to do to make Stomp more useful:
  * Handle numbers 
  * Handle other easily recognized things (such as date expressions)

*/


/* some standard library stuff we need */
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

/* this is used for timing statistics and for guessing how 
   much time remains, feel free to remove all timing stuff
   if you don't have these functions */
#include <sys/times.h>

/* this is used to read corpus data faster,
   if you don't have these it should be trivial to
   change the code to use only standard library stuff */
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

#if 1
/* If you have hash tables of some kind, use them. 
   Otherwise you might use std::map, which is much slower...
   This branch uses the GNU extension hash_map and supplies the two hash
   functors it lacks for the key types used in this file. */
#include <ext/hash_map>
namespace __gnu_cxx {
  /* hash a std::string by delegating to the C-string hash */
  template<> struct hash< std::string > {
    size_t operator()( const std::string& x ) const {
      return hash< const char* >()( x.c_str() );
    }
  };
  /* hash a pair of token ids (used as bigram key).
     NOTE(review): the hash is computed from first + second, so (a,b) and
     (b,a), and every pair with the same sum, land in the same bucket.
     Lookups stay correct, but the bigram table may be slower than needed. */
  template<> struct hash< std::pair<int,int> > {
    size_t operator()( const std::pair<int,int>& x ) const {
      return hash<int>()(hash<int>()(x.first) + hash<int>()(x.second));
    }
  };
}
/* the four map types used throughout the tagger */
typedef __gnu_cxx::hash_map<std::string, int, __gnu_cxx::hash<std::string> > map_string_int;
typedef __gnu_cxx::hash_map<int, int, __gnu_cxx::hash<int> > map_int_int;
typedef __gnu_cxx::hash_map<std::pair<int,int>, std::vector<int> > map_pair_vector;
typedef __gnu_cxx::hash_map<std::pair<int,int>, int > map_pair_int;
#else 
/* if you have no hash tables, this uses only the C++ standard library,
   but is much slower */
#include <map>
typedef std::map<std::string, int> map_string_int;
typedef std::map<int, int> map_int_int;
typedef std::map<std::pair<int, int>, std::vector<int> > map_pair_vector;
typedef std::map<std::pair<int, int>, int> map_pair_int;
#endif

int XMS = 0, XML = 0; // these are used to calculate mean length of matches

const int WORD_INTERVAL = 1 << 14; // After WORD_INTERVAL words estimated time remaining will be printed
const int FORCE_MATCH_ABOVE = 2;  // Assign tags to context too if context words are more than FMA words from match edge
const int SHORTEST_COMPOUND = 6; // Size of shortest compound end allowed, 6 seems best with suffix as below
const int SUFFIX_LENGTH = 4;    // how many suffix characters to look at for unknown words, 4 seems best for Swedish

const int TRAINING_SIZE = 1200000;  // size (in tokens) of training data (if actual data is larger, program will crash)
const int LEXICON_SIZE = 100000;   // number of unique tokens in training data (crash if too small)
const int HAPAX_SIZE = 60000;     // number of tokens occurring only once in training data (crash if too small)
const int MAX_MATCHES = 10000;   // the highest number of matches ever to be encountered for a token
const int MAX_TEXT_SIZE = 70000;// largest possible size (in tokens) of text to tag
                               // it is ok to set all these too high, but it will waste some memory
std::vector<int> has_many_tags;// indicates whether token with a certain id is ambiguous in the training data
std::vector<int> hapax_pos;   // lists all positions where hapax tokens occur
int HAPAX;                   // the special token id used to indicate a hapax word

std::vector<std::vector<int> > hapax_suffix_pos; // lookup table for positions of all hapax words with a certain suffix
map_int_int index_of_text_to_suffix_index;      // token on 'index' in text has suffix 'suffix' 
map_string_int hapax_suffix_lex;               // suffix (letters) to int mapping        
map_pair_vector bigrams;                      // lookup table for all positions of token bigrams

/* larger of two ints; when both are equal the second argument is returned */
inline int max(int a, int b) {
  if(a > b)
    return a;
  return b;
}


int WS = 0;    // size of vector 'words' (set by load_corpus, counts both sentinels)
int TS = 0;   // size of vector 'text' (set by load_text before the trailing sentinel is appended)
int TLS = 0; // size of map 'tag_lex' (set by load_taglex to the number of tags read)

/* sentinel values, to be able to ignore bounds checking on
 * 'words' and 'text' vectors when matching
 * all three must have different values and these must be values
 * that no token can have 
 * (the text and corpus sentinels differ so that a match can never
 * run across the end of both vectors at once)
 */
#define TEXT_SENTINEL_VALUE -2
#define CORPUS_SENTINEL_VALUE -3
#define UNKNOWN_WORD_INT -1

/* compile time flags, to change the behaviour of the tagger;
 * comment out a line to switch the corresponding feature off */
#define COMPOUNDS      // use compound analysis
#define VERBOSE       // print a lot of info to std::cerr
#define BACK_OFF     // use back-off
#define PRINT_SCORE // print score of matches when printing result

/* One candidate match between a position in the text to tag and a
 * position in the training corpus.
 */
struct Match {
  /* build a match at corpus position 'corpus_pos' with 'left' matching
     tokens before it, 'right' matching tokens after it and ranking 'rank' */
  Match(int corpus_pos, int left, int right, int rank)
    : word_index(corpus_pos),
      before(left),
      after(right),
      score(rank) {}

  /* default: an invalid match that scores below every real match */
  Match()
    : word_index(-1),
      before(-1),
      after(-1),
      score(-100) {}

  int word_index; // index in training data
  int before;     // length of left context
  int after;      // length of right context
  int score;      // ranking score of this match
};

// Score thresholds. The score functions below return (i*j) << 10 when both
// sides match and a small even number when only one side matches, so any
// two sided match scores at least BOTH_SIDES_SCORE.
const int BOTH_SIDES_SCORE = 1 << 10;              // what minimum score will a two sided match generate
const int WORD_ONLY_SCORE = 1;                    // score of a match with no matching context
const int UNAMBIGUOUS_SCORE = 5000;              // How high should we rank words that we have only seen with one tag?
const int BACK_OFF_THRESHOLD = BOTH_SIDES_SCORE;// anything (equal to or) above this will not use back-off


/* A bunch of functions for calculating the score of a match.
 * A large part of the actual work is made up from these,
 * so there are several functions to try and gain a little 
 * extra speed.
 */
/* score of a match with j tokens of left context (j >= 1) and i tokens
 * of right context (i may be 0) */
inline int score_l(int j, int i) {
  // two sided matches are ranked far above one sided ones
  return i ? ((i * j) << 10) : (j << 1);
}
/* score of a left-only match: j tokens of left context (j >= 1),
 * no right context */
inline int score_lo(int j) {
  const int left_only = j << 1;
  return left_only;
}
/* score of a match with i tokens of right context (i >= 1) and j tokens
 * of left context (j may be 0) */
inline int score_r(int j, int i) {
  // two sided matches are ranked far above one sided ones
  return j ? ((i * j) << 10) : (i << 1);
}
/* score of a right-only match: i tokens of right context (i >= 1),
 * no left context */
inline int score_ro(int i) {
  const int right_only = i << 1;
  return right_only;
}

/* Transform upper case characters to lower case characters.
 * The standard library function (std::tolower) did not handle all Swedish
 * characters as desired, but might be ok for you. Very little time
 * is spent here, so performance is not critical.
 *
 * ASCII 'A'-'Z' and the three extra Swedish capitals are folded, every
 * other value is returned unchanged. The Swedish letters are written as
 * hexadecimal escapes so that this source file stays plain ASCII and
 * survives re-encoding (the literal characters had been lost, leaving
 * empty character constants that do not compile).
 * NOTE(review): the byte values are the ISO-8859-1 codes for the letters
 * A-ring, A-diaeresis and O-diaeresis; this assumes the corpus and the
 * text to tag are Latin-1 encoded - confirm against your data.
 *
 * The argument is expected to be a plain char converted to int (as done
 * by std::transform over a std::string), so the comparisons work whether
 * plain char is signed or unsigned.
 */
inline int my_tolower(const int i) {
  if(i >= 'a' && i <= 'z')
    return i;
  if(i >= 'A' && i <= 'Z')
    return i - 'A' + 'a';
  if(i == '\xC5')    // capital A with ring above
    return '\xE5';
  if(i == '\xC4')    // capital A with diaeresis
    return '\xE4';
  if(i == '\xD6')    // capital O with diaeresis
    return '\xF6';
  return i;
}

/* Function for loading corpus data.
 * Assumes each line has a word and a tag, separated by a tab (and nothing else).
 * Words and tags cannot contain tabs (or newlines).
 * Uses the mmap system call, for fast reading. Should be
 * trivial to modify to use only standard library routines (there is
 * an older version of the tagger that does this, if you want that code,
 * send an e-mail).
 */
void load_corpus(const std::string &filename,// file to read
		 std::vector<int> &words,    // vector to store corpus text in
		 std::vector<int> &tags,     // vector to store corpus tags in
		 std::vector<std::vector<int> > &word_pos, // positions of all occurences of each token stored here
		 map_string_int &word_int,        // map to translate strings to ints, for tokens
		 const map_string_int &tag_lexicon// map to translate strings to ints, for tags
		 ) { 
#ifdef VERBOSE
  std::cerr << "loading training corpus\n";
#endif
  
  /* mmap stuff... */
  struct stat buf;
  stat(filename.c_str(), &buf);
  int len = buf.st_size;
  int fildes = open(filename.c_str(), O_RDONLY);
  void *vp = mmap(0, len, PROT_READ, MAP_PRIVATE, fildes, 0);
  
  /* now we get the whole file in one big char array */
  char *p = static_cast<char *>(vp);

  const map_string_int::const_iterator tag_lexicon_end(tag_lexicon.end());
  int next_id = 0;
  int wordint, old_w;
  has_many_tags.resize(LEXICON_SIZE);       // no bounds checking done yet... will crash if training data too large
  std::vector<bool> is_hapax(LEXICON_SIZE);// no bounds checking done yet... will crash if training data too large
  map_string_int::iterator word_int_end = word_int.end();
  int i = 0, old_i, index = 0; 

  /* add sentinel values first (and later last) in training data,
     so we don't need to check the bounds when matching */
  words.push_back(CORPUS_SENTINEL_VALUE); index++;
  tags.push_back(CORPUS_SENTINEL_VALUE);

  std::vector<std::string> suffixv; suffixv.resize(LEXICON_SIZE, "");

  while(i < len) {
    old_i = i;
    while(p[i] != '\t')
      i++;
    std::string word(p + old_i, i - old_i);
    i++;
    old_i = i;
    while(p[i] != '\n')
      i++;
    std::string tagstr(p + old_i, i - old_i);
    while(p[i] == '\n')
      i++;
    std::transform(word.begin(),word.end(),word.begin(),my_tolower); // takes very little time
    map_string_int::iterator word_int_it = word_int.find(word);
    if(word_int_it == word_int_end) {
      wordint = next_id;
      word_int[word] = next_id++;
      is_hapax[wordint] = 1;
    } else {
      wordint = word_int_it->second;
      is_hapax[wordint] = 0;
    }

    suffixv[wordint] = word.substr(max(word.length() - SUFFIX_LENGTH,0));

    word_pos[wordint].push_back(index++);
    words.push_back(wordint);
    if(index > 2) {// if this is not the first word of all 
      std::pair<int,int> temp(old_w, wordint);
      bigrams[temp].push_back(index-2);
    }
    old_w = wordint;
    map_string_int::const_iterator tag = tag_lexicon.find(tagstr);
    if(tag == tag_lexicon_end) {
      std::cerr << "Error loading corpus: Unknown tag '" << tagstr << "'\n";
      tags.push_back(-2);
    } else
      tags.push_back(tag->second);

    // detect words that always have the same tag in the training
    // data (all matches will give the same result for these words)
    if(!has_many_tags[wordint] && tag->second != tags[word_pos[wordint][0]])
      has_many_tags[wordint] = 1;
  }


  /* as above, add sentinel value last in corpus,
     to skip bounds checking while matching */
  words.push_back(CORPUS_SENTINEL_VALUE); index++;
  tags.push_back(CORPUS_SENTINEL_VALUE);

  WS = words.size();

  /* Do some stuff with hapax words, used later when 
     non-compound unknown words are detected */
  HAPAX = next_id;
  hapax_pos.reserve(HAPAX_SIZE);

  int hapax_next_id = next_id + 1;
  hapax_suffix_pos.resize(HAPAX_SIZE);

  for(int i = 0; i < next_id; i++)
    if(is_hapax[i]) {
      int pos = word_pos[i][0];

      int id = HAPAX;
      std::string suffix = suffixv[i];
      if(hapax_suffix_lex.find(suffix) != hapax_suffix_lex.end())
	id = hapax_suffix_lex[suffix];
      else {
	id = hapax_next_id++;
	hapax_suffix_lex[suffix] = id;
      }
      hapax_suffix_pos[id-HAPAX].push_back(pos);
      if(pos < WS - 1) {
	bigrams[std::pair<int,int>(id,words[pos+1])].push_back(pos);
	if(is_hapax[words[pos+1]])
	  bigrams[std::pair<int,int>(id,HAPAX)].push_back(pos);
      }
      if(pos > 0) {
	bigrams[std::pair<int,int>(words[pos-1], id)].push_back(pos-1);
	if(is_hapax[words[pos-1]])
	  bigrams[std::pair<int,int>(HAPAX,id)].push_back(pos-1);
      }
      
      /* Note to self:
	 Perhaps treat hapax words as unknown words when tagging?
	 There are some errors on hapax words (which will always be
	 unambiguous in the training data) in the test data.
	 (quick experiment indicates tagging hapax words as unknown
	 gives much worse performance).
      */

      hapax_pos.push_back(pos);
      if(pos < WS - 1) {
	bigrams[std::pair<int,int>(HAPAX,words[pos+1])].push_back(pos);
	if(is_hapax[words[pos+1]])
	  bigrams[std::pair<int,int>(HAPAX,HAPAX)].push_back(pos);
      }
      if(pos > 0) {
	bigrams[std::pair<int,int>(words[pos-1], HAPAX)].push_back(pos-1);
	if(is_hapax[words[pos-1]])
	  bigrams[std::pair<int,int>(HAPAX,HAPAX)].push_back(pos-1);
      }
    }  
#ifdef VERBOSE
  std::cerr << next_id << " unique tokens in training data, " << hapax_pos.size() << " hapax\n";
#endif
}

/* load tag lexicon
 * Assumes one tag per line. Empty lines are skipped.
 * Tags are numbered from 0 in file order; 'tag_lex' maps tag to number
 * and 'tag_lookup' maps number back to tag. Sets the global TLS to the
 * number of entries in 'tag_lookup'.
 * A file that cannot be opened is reported on std::cerr and gives an
 * empty lexicon.
 */
void load_taglex(const std::string &filename, 
		 map_string_int &tag_lex, 
		 std::vector<std::string> &tag_lookup) {
#ifdef VERBOSE
  std::cerr << "loading tag lexicon" << std::endl;
#endif

  std::ifstream in(filename.c_str());
  if(!in)
    std::cerr << "Error loading tag lexicon: cannot open '" << filename << "'\n";

  int tag_no = 0;
  std::string line;
  while(std::getline(in, line)) {
    // assume tag\n
    if(line.empty())
      continue;
    tag_lex[line] = tag_no++;
    tag_lookup.push_back(line);
  }
  TLS = tag_lookup.size();
}

/* load the text we want to tag 
 * Tokenizes words with the default operator>>() (on whitespace
 * basically). Modify if required.
 */
void load_text(const std::string &filename, 
	       std::vector<int> &text,                  // text to tag
	       std::vector<std::string> &original_text,// as above, but preserve text and case, for printing result
	       std::vector<int> &res,                 // store result here (included to resize appropriately here)
	       map_string_int &word_int              // map from string to int
	       ) {
#ifdef VERBOSE
  std::cerr << "reading text from file " << filename << std::endl;
#endif
  std::ifstream in(filename.c_str());

  /* as usual, we want a sentinel value, so as to skip bounds
     checking when matching */
  text.push_back(TEXT_SENTINEL_VALUE);
  original_text.push_back("");
  int index = 0, found = 0, unknown = 0;

  while(in) {
    std::string w;
    in >> w;
    if(w.size() > 0) {
      index++;
      original_text.push_back(w);
      std::transform(w.begin(),w.end(),w.begin(),my_tolower);
      if(word_int.find(w) != word_int.end())
	text.push_back(word_int[w]);
      else { // unknown word
	unknown++;

	/*
	 * NOTE: For most actual uses you should probably handle numbers
	 * and so on better than it is done now... Numbers are generally
	 * easy to tag, even when they are unknown words, but Stomp will
	 * most likely behave very stupidly on them.
	 *
	 */

#ifdef COMPOUNDS
	std::string word;
	int i = 1, try_more = 1, found_split = 0;
	int word_size = w.size();
	if(word_size < SHORTEST_COMPOUND) {
	  try_more = 0;
	  if(w[0] == '!' 
	     || w[0] == '?'
	     || w[0] == '.') {
	    found_split = 1;
	    word = w[0];
	    i = 0;
	  }
	}
	// find longest suffix that is a word we know
	map_string_int::iterator word_int_end = word_int.end();
	while(try_more) {
	  word = w.substr(i);
	  if(word_int.find(word) != word_int_end) {
	    try_more = 0;
	    found_split = 1;
	  } else {
	    ++i;
	    if(i > word_size - SHORTEST_COMPOUND)
	      try_more = 0;
	  }
	}
	if(found_split) {
	  found++;
	  text.push_back(word_int[word]);
	  word_int[w] = word_int[word]; // save chosen compound rule
	}
	else {
	  text.push_back(UNKNOWN_WORD_INT);
	  std::string suffix = w.substr(max(w.length()-SUFFIX_LENGTH,0));
	  map_string_int::iterator it = hapax_suffix_lex.find(suffix);
	  if(it != hapax_suffix_lex.end())	    
	    index_of_text_to_suffix_index[index] = it->second;
	  else
	    index_of_text_to_suffix_index[index] = -1;
	}
#else // no COMPOUNDS
	text.push_back(UNKNOWN_WORD_INT);
	std::string suffix = w.substr(max(w.length()-SUFFIX_LENGTH,0));
	map_string_int::iterator it = hapax_suffix_lex.find(suffix);
	if(it != hapax_suffix_lex.end())	    
	  index_of_text_to_suffix_index[index] = it->second;
	else
	  index_of_text_to_suffix_index[index] = -1;
#endif
      }
    }
  }

  TS = text.size();
  res.resize(TS,-1);

  /* as usual, we want a sentinel value, so as to skip bounds
     checking when matching */
  text.push_back(TEXT_SENTINEL_VALUE);
  res[0] = TEXT_SENTINEL_VALUE;
  res.push_back(TEXT_SENTINEL_VALUE);

  // this data is not strictly true, since once a compound rule is guessed word is counted as known ...
  std::cerr << unknown << " unknown words in text, " << found << " probably compounds\n";
}

/* concatenate a directory name and a file name
 * change '/' to '\' or whatever you need, if required
 *
 * Exactly one '/' ends up between the two parts, whether or not 'dir'
 * ends with one and 'file' starts with one. An empty 'dir' means "no
 * directory" and returns 'file' unchanged (it used to index before the
 * start of the string).
 */
inline std::string dirFile(const std::string &dir, 
			   const std::string &file) {
  if(dir.empty())
    return file;

  const bool dir_has_slash = dir[dir.length()-1] == '/';
  const bool file_has_slash = !file.empty() && file[0] == '/';

  if(dir_has_slash && file_has_slash)
    return dir + file.substr(1); // drop the duplicate separator
  if(dir_has_slash || file_has_slash)
    return dir + file;
  return dir + "/" + file;
}

/* print result
 * Writes one line per token of the text to std::cout: the score of the
 * match (only with PRINT_SCORE), the token as written and its tag,
 * separated by tabs. Entry 0 of every vector is the sentinel and is
 * skipped; the global TS gives the number of entries to print.
 * A tag id outside 'tag_lookup' is printed as a diagnostic string instead
 * of being used as an index (an id equal to the number of tags used to
 * slip through and read past the end of 'tag_lookup').
 */
void print(const std::vector<std::string> &text, 
	   const std::vector<int> &result, 
	   const std::vector<double> &length, 
	   const std::vector<std::string> &tag_lookup) {
#ifdef VERBOSE
  std::cerr << "printing result" << std::endl;
#endif
  const int tag_count = static_cast<int>(tag_lookup.size());
  for(int i = 1; i < TS; ++i) {
#ifdef PRINT_SCORE
    std::cout << length[i] << "\t";
#endif
    std::cout << text[i] << "\t";
    const int res = result[i];
    if(res >= tag_count) 
      std::cout << "tag.id.too.large.should.not.happen";
    else if(res > -1)
      std::cout << tag_lookup[res];
    else
      std::cout << "tag.id.negative.should.not.happen";
    std::cout << "\n";
  }
}

/* Below is a bunch of matching functions, that all do more or
 * less the same, but some are used when you know that at least
 * one word to the left/right will match, to gain a little speed
 * Most of the work of the tagger is done in the matching
 * functions.
 */

/* Measure a match where the centre token and the token after it are
 * already known to agree. No bounds are checked: the scan stops at the
 * sentinels stored at both ends of 'text' and 'words'.
 */
inline Match match_length_r(const std::vector<int> &text, 
			    const int &text_index,
			    const std::vector<int> &words,
			    const int &words_index
			    ) {
  // right context: one token is given, extend while the next one agrees
  int after = 1;
  while(words[words_index + after + 1] == text[text_index + after + 1])
    ++after;

  // left context: nothing is given, count from the nearest neighbour
  int before = 0;
  while(words[words_index - before - 1] == text[text_index - before - 1])
    ++before;

  return Match(words_index, before, after, score_r(before, after));
}

/* Measure a match where the centre token and the token after it are
 * already known to agree and nothing agrees on the left side, so only
 * the right context is scanned. The scan stops at the sentinels.
 */
inline Match match_length_ro(const std::vector<int> &text, 
			     const int &text_index,
			     const std::vector<int> &words,
			     const int &words_index
			     ) {
  // right context: one token is given, extend while the next one agrees
  int after = 1;
  while(words[words_index + after + 1] == text[text_index + after + 1])
    ++after;

  return Match(words_index, 0, after, score_ro(after));
}

/* Measure a match where the centre token and the token before it are
 * already known to agree. No bounds are checked: the scan stops at the
 * sentinels stored at both ends of 'text' and 'words'.
 */
inline Match match_length_l(const std::vector<int> &text, 
			    const int &text_index,
			    const std::vector<int> &words,
			    const int &words_index
			    ) {
  // right context: nothing is given, count from the nearest neighbour
  int after = 0;
  while(words[words_index + after + 1] == text[text_index + after + 1])
    ++after;

  // left context: one token is given, extend while the next one agrees
  int before = 1;
  while(words[words_index - before - 1] == text[text_index - before - 1])
    ++before;

  return Match(words_index, before, after, score_l(before, after));
}

/* Measure a match where the centre token and the token before it are
 * already known to agree and nothing agrees on the right side, so only
 * the left context is scanned. The scan stops at the sentinels.
 */
inline Match match_length_lo(const std::vector<int> &text, 
			     const int &text_index,
			     const std::vector<int> &words,
			     const int &words_index
			     ) {
  // left context: one token is given, extend while the next one agrees
  int before = 1;
  while(words[words_index - before - 1] == text[text_index - before - 1])
    ++before;

  return Match(words_index, before, 0, score_lo(before));
}


/* Add a match to the vector, but only keep the highest scoring matches.
 * A large part of the total running time is spent here.
 *
 * Best_score holds the score shared by everything currently in the
 * vector; the caller resets it (and clears the vector) for each token.
 */
int Best_score = -1;
inline void add_match(std::vector<Match> &matches,
		      const Match &match) {
  // most common case first: the candidate is worse than what we have
  if(match.score < Best_score)
    return;

  // a strictly better candidate makes everything collected so far obsolete
  if(match.score > Best_score) {
    Best_score = match.score;
    matches.clear();
  }

  // equal to (or the new) best score: keep it
  matches.push_back(match);
}

/* Return the most common tag for a word. Positions of word occurences are stored in 'v'.
 * Caches result.
 *
 * 'tags' holds the tag id of every corpus position and 'max_no_tags' is
 * the number of tag ids. Ties are won by the lowest tag id. Returns 0 for
 * an empty position list. Tag ids outside [0, max_no_tags) (such as the
 * negative value stored for unknown tags) are not counted.
 *
 * The cache is keyed on the first position AND the length of the list.
 * Keying on the first position alone made the list of all hapax positions
 * and the list for the first hapax word's suffix share one cache entry,
 * since both start at the same position. Two lists with the same first
 * position and the same length are assumed to be the same list; this
 * holds for the position lists built by load_corpus, where a suffix list
 * is an order preserving subset of the hapax list.
 */
inline int unigram_tag(const std::vector<int> &tags, const std::vector<int> &v, int max_no_tags) {
  if(v.empty() || max_no_tags <= 0)
    return 0;

  typedef std::pair<int,int> cache_key;
  static std::map<cache_key, int> cached_values;
  const cache_key key(v[0], static_cast<int>(v.size()));
  const std::map<cache_key, int>::const_iterator hit = cached_values.find(key);
  if(hit != cached_values.end())
    return hit->second;

  // count how often each tag occurs on the given positions
  std::vector<int> counts(max_no_tags);
  for(std::vector<int>::size_type i = 0; i < v.size(); ++i) {
    const int t = tags[v[i]];
    if(t >= 0 && t < max_no_tags)
      ++counts[t];
  }

  // pick the most frequent tag, the first one on ties
  int best_tag = 0, best_count = 0;
  for(int t = 0; t < max_no_tags; ++t) {
    if(counts[t] > best_count) {
      best_count = counts[t];
      best_tag = t;
    }
  }
  cached_values[key] = best_tag;
  return best_tag;
}

// tag text
void tag(const std::vector<int> &text,                       // text to tag
	 std::vector<int> &result,                          // tagging result
	 std::vector<double> &length,                      // scores of matches
	 const std::vector<int> &words,                   // corpus text 
	 const std::vector<int> &tags,                   // corpus tags
	 const std::vector<std::vector<int> > &word_pos // positions where each token occurs
	 ) {
#ifdef VERBOSE
  std::cerr << "tagging text" << std::endl;
#endif
  int unknown = 0;
  int index = 1;
  bool already_done = 0;

  int ml = 0, ms = 0;

  length.resize(TS,0);
  std::vector<Match> matches; 
  matches.reserve(MAX_MATCHES);// set size to contain all matches for the most common word, will crash if set too small

  int *votes = new int[TLS];
  while(index < TS) {
    int current_word = text[index];
    if(result[index] >= 0) { // if word has been tagged already (as part of a very long match), do nothing
      already_done = 1;
    } else if(current_word >= 0 && !has_many_tags[current_word]) { 
      // if word is unambiguous in training data, we know the result
      result[index] = tags[word_pos[current_word][0]];
      length[index] = UNAMBIGUOUS_SCORE;
      already_done = 1;
    } else {
      matches.clear(); 
      Best_score = -1;
      bool dont_look_left = 0, dont_look_right = 0;
      if(current_word < 0) { // unknown word
	unknown++;
	// change unknown word to "fake" word based on suffix and hapax
	current_word = index_of_text_to_suffix_index[index]; 
	if(current_word < 0)	  
	  current_word = HAPAX; // if still unknown, "fake" as any hapax word
      }
      // build token bigram, more complicated than it seems, since we might
      // have unknown words in context and have to pretend they are known
      std::pair<int,int> w1,w2;
      if(index > 0)
	if(text[index-1] >= 0) // known word
	  w1 = std::pair<int,int>(text[index-1],current_word);
	else 
#if 0
	  w1 = std::pair<int,int>(HAPAX, current_word);
#else
  	  if(current_word >= HAPAX)
  	    w1 = std::pair<int,int>(HAPAX, current_word);
  	  else
 	    dont_look_left = 1;
#endif
      else
	dont_look_left = 1;
      if(index < TS - 1)
	if(text[index+1] >= 0) // known word
	  w2 = std::pair<int,int>(current_word, text[index+1]);
	else
#if 0
	  w2 = std::pair<int,int>(current_word, HAPAX);
#else
  	  if(current_word >= HAPAX)
  	    w2 = std::pair<int,int>(current_word, HAPAX);
  	  else
 	    dont_look_right = 1;
#endif
      else
	dont_look_right = 1;
      
      if(dont_look_left || bigrams.find(w1) ==  bigrams.end())
	dont_look_left = 1;
      if(dont_look_right || bigrams.find(w2) ==  bigrams.end())
	dont_look_right = 1;

      if(dont_look_left && dont_look_right) {
	if(text[index] >= 0) {
	    result[index] = unigram_tag(tags, word_pos[current_word], TLS);
	    length[index] = WORD_ONLY_SCORE;
	  } else {
	    int temp = index_of_text_to_suffix_index[index];
	    if(temp >= 0) {
	      result[index] = unigram_tag(tags, hapax_suffix_pos[temp - HAPAX], TLS);
	    } else {
	      result[index] = unigram_tag(tags, hapax_pos, TLS); 
	    }
	    length[index] = WORD_ONLY_SCORE;
	  }
	ml++; ms++;
	already_done = 1;
      } else if(dont_look_left && !dont_look_right) {
	std::vector<int> &b = bigrams[w2];
	std::vector<int>::const_iterator end = b.end();
	for(std::vector<int>::const_iterator it = b.begin();
	    it != end;
	    ++it)
	  add_match(matches, match_length_ro(text, index, words, *it));
      } else if(!dont_look_left && dont_look_right) {
	std::vector<int> &b = bigrams[w1];
	std::vector<int>::const_iterator end = b.end();
	for(std::vector<int>::const_iterator it = b.begin();
	    it != end;
	    ++it)
	  add_match(matches, match_length_lo(text, index, words, *it + 1));
      } else { //if(!dont_look_left && !dont_look_right) {
	const std::vector<int> &wp1 = bigrams[w1];
	const std::vector<int> &wp2 = bigrams[w2];
	if(wp1.size() < wp2.size()) {
	  std::vector<int>::const_iterator end = wp1.end();
	  for(std::vector<int>::const_iterator it = wp1.begin();
	      it != end;
	      ++it) 
	    add_match(matches, match_length_l(text, index, words, *it + 1));
	} else {
	  std::vector<int>::const_iterator end = wp2.end();
	  for(std::vector<int>::const_iterator it = wp2.begin();
	      it != end;
	      ++it)
	    add_match(matches, match_length_r(text, index, words, *it));
	} 
	if(Best_score < BOTH_SIDES_SCORE) { // we need to check the other side too
	  if(wp1.size() >= wp2.size()) {
	    std::vector<int>::const_iterator end = wp1.end();
	    for(std::vector<int>::const_iterator it = wp1.begin();
		it != end;
		++it)
	      add_match(matches, match_length_l(text, index, words, *it + 1));
	  } else {
	    std::vector<int>::const_iterator end = wp2.end();
	    for(std::vector<int>::const_iterator it = wp2.begin();
		it != end;
		++it) 
	      add_match(matches, match_length_r(text, index, words, *it));
	  }
	}
      }
    }
    if(already_done) { // if we had an unambiguous or unigram word, do nothing
      already_done = 0;
      ++index;
    } else {
      if(Best_score == -1) {
	std::cerr << "No matches? This should not happen...\n";
	index++;
      } else {
	Match best_match = matches[0];
	int MS = matches.size();
	if(MS > 1) {
	  memset(votes, 0, TLS*sizeof(int));
	  for(int i = 0; i < MS; i++) {
	    // currently voting only by chosen tag on focus word, seems suboptimal cmp. to old way... but faster
	    votes[tags[matches[i].word_index]]++;
	  }
	  int max = 0;
	  int maxi = 0;
 	  for(int i = 0; i < TLS; i++)
 	    if(votes[i] > max) {
 	      max = votes[i];
 	      maxi = i;
 	    }
	  for(int i = 0; i < MS; i++) 
	    if(tags[matches[i].word_index] == maxi) {
	      best_match = matches[i];
	      break;
	    }
	}
	if(best_match.before > FORCE_MATCH_ABOVE && best_match.after > FORCE_MATCH_ABOVE) {
	  // tag some words in the context too
	  for(int i = 0; i <= best_match.after - FORCE_MATCH_ABOVE; ++i) {
	    result[index + i] = tags[best_match.word_index + i];
	    length[index + i] = best_match.score;
	    
	    // save some statistics for mean length of matches
	    ml+=best_match.before + best_match.after + 1;ms++;
	    if(best_match.score >= BACK_OFF_THRESHOLD) {
	      XMS++; XML+=best_match.before + best_match.after + 1;
	    }
	  }
	  index += best_match.after - FORCE_MATCH_ABOVE;
	} else { // don't force context words to this match if this one is short
	  result[index] = tags[best_match.word_index];
	  length[index] = best_match.score;

	  // save some statistics for mean length of matches
	  ml+=best_match.before + best_match.after + 1;ms++;
	  if(best_match.score >= BACK_OFF_THRESHOLD) {
	    XMS++; XML+=best_match.before + best_match.after + 1;
	  }
	  index++;
	}	 
      }
    } 
  }
  std::cerr << "Tagging: " << ((float) ml) / ms << " (no unambiguous)\n";
#ifdef VERBOSE
  std::cerr << TS << " words tagged, " << unknown << " unknown words." << std::endl;
#endif
  delete [] votes;
}

#ifdef BACK_OFF
/* A bunch of functions for calculating the score of a match when
 * using the back-off method. As with the score functions above, a large
 * part of the actual work is done in these, so there are several
 * specialized variants to try to gain a little extra speed.
 */
/* Score for a back-off match found through the left-hand bigram.
 * j and i are the two context lengths; the callers in this file pass
 * (l-1, k-1), i.e. the backward length as j and the forward length as i.
 * When i is zero the score is only 2*j, otherwise it is the product of
 * both lengths shifted left by 10 bits.
 */
inline int next_score_l(int j, int i) {
  // j is known to be at least 1, so only i has to be tested for zero
  return i ? ((i * j) << 10) : (j << 1);
}
/* Score for a back-off match found through the right-hand bigram.
 * Mirror image of next_score_l: here i is the length that is known to be
 * non-zero and j is the one that may be zero. When j is zero the score is
 * only 2*i, otherwise it is the product of both lengths shifted left by
 * 10 bits.
 */
inline int next_score_r(int j, int i) {
  // i is known to be at least 1, so only j has to be tested for zero
  return j ? ((i * j) << 10) : (i << 1);
}
/* Score for a back-off match where neither side is known to be non-empty.
 * k and l are the loop counters of the forward and backward scans, i.e.
 * one more than the respective match lengths. With both sides empty the
 * score is 1; with one side empty it is twice the other length; otherwise
 * it is the product of both lengths shifted left by 10 bits.
 */
inline int next_score(int k, int l) {
  const int fwd = k - 1;
  const int bwd = l - 1;

  if(k != 1 && l != 1)
    return (fwd * bwd) << 10;
  if(k != 1)   // only the forward side matched anything
    return fwd << 1;
  if(l != 1)   // only the backward side matched anything
    return bwd << 1;
  return 1;    // nothing matched on either side
}
/* back-off matching
 * The same as normal matching, but continue matching
 * on tags when words no longer match. Match tags assigned
 * in the ordinary tagging step and tags in the training data.
 */
/* Tag-only variant of back-off matching: no context word is compared,
 * only the training tags against the tags already assigned to the text
 * (the text and words arguments are unused here).
 * The scans have no explicit bounds check; they stop at the first
 * position where the tags differ.
 */
inline Match back_off_match_length_c(const std::vector<int> &text,
				 const int &text_index,
				 const std::vector<int> &result,
				 const std::vector<int> &words,
				 const int &words_index,
				 const std::vector<int> &tags) {
  // forward: count positions to the right whose tags agree
  int fwd = 1;
  for(; tags[words_index + fwd] == result[text_index + fwd]; ++fwd)
    ;

  // backward: count positions to the left whose tags agree
  int bwd = 1;
  for(; tags[words_index - bwd] == result[text_index - bwd]; ++bwd)
    ;

  // the counters are one past the last agreeing offset
  return Match(words_index, bwd - 1, fwd - 1, next_score(fwd, bwd));
}

/* Back-off matching for a candidate found through the left bigram, with
 * word matching in both directions. In each direction the scan first
 * runs over matching words and then continues over matching tags from
 * where the words stopped.
 * The backward word scan starts at offset 2 -- presumably because the
 * caller located this position through the (previous word, current word)
 * bigram, so offset 1 is already known to match.
 * No explicit bounds checks; the scans stop at the first mismatch.
 */
inline Match back_off_match_length_l(const std::vector<int> &text,
				 const int &text_index,
				 const std::vector<int> &result,
				 const std::vector<int> &words,
				 const int &words_index,
				 const std::vector<int> &tags) {
  // forward: words first, then keep going on tags
  int fwd = 1;
  while(words[words_index + fwd] == text[text_index + fwd])
    fwd++;
  while(tags[words_index + fwd] == result[text_index + fwd])
    fwd++;

  // backward: words first (from offset 2), then keep going on tags
  int bwd = 2;
  while(words[words_index - bwd] == text[text_index - bwd])
    bwd++;
  while(tags[words_index - bwd] == result[text_index - bwd])
    bwd++;

  const int before = bwd - 1;
  const int after = fwd - 1;
  return Match(words_index, before, after, next_score_l(before, after));
}

/* "Left only" back-off matching: like back_off_match_length_l, but the
 * forward direction is compared on tags alone (no word comparison to the
 * right). Backward the scan runs over matching words starting at
 * offset 2 and then continues over matching tags.
 * No explicit bounds checks; the scans stop at the first mismatch.
 */
inline Match back_off_match_length_lo(const std::vector<int> &text,
				  const int &text_index,
				  const std::vector<int> &result,
				  const std::vector<int> &words,
				  const int &words_index,
				  const std::vector<int> &tags) {
  // forward: tags only
  int fwd = 1;
  for(; tags[words_index + fwd] == result[text_index + fwd]; ++fwd)
    ;

  // backward: words first (from offset 2), then keep going on tags
  int bwd = 2;
  while(words[words_index - bwd] == text[text_index - bwd])
    bwd++;
  while(tags[words_index - bwd] == result[text_index - bwd])
    bwd++;

  const int before = bwd - 1;
  const int after = fwd - 1;
  return Match(words_index, before, after, next_score_l(before, after));
}

/* Back-off matching for a candidate found through the right bigram, with
 * word matching in both directions. In each direction the scan first
 * runs over matching words and then continues over matching tags from
 * where the words stopped.
 * The forward word scan starts at offset 2 -- presumably because the
 * caller located this position through the (current word, next word)
 * bigram, so offset 1 is already known to match.
 * No explicit bounds checks; the scans stop at the first mismatch.
 */
inline Match back_off_match_length_r(const std::vector<int> &text,
				 const int &text_index,
				 const std::vector<int> &result,
				 const std::vector<int> &words,
				 const int &words_index,
				 const std::vector<int> &tags) {
  // forward: words first (from offset 2), then keep going on tags
  int fwd = 2;
  while(words[words_index + fwd] == text[text_index + fwd])
    fwd++;
  while(tags[words_index + fwd] == result[text_index + fwd])
    fwd++;

  // backward: words first, then keep going on tags
  int bwd = 1;
  while(words[words_index - bwd] == text[text_index - bwd])
    bwd++;
  while(tags[words_index - bwd] == result[text_index - bwd])
    bwd++;

  const int before = bwd - 1;
  const int after = fwd - 1;
  return Match(words_index, before, after, next_score_r(before, after));
}

/* "Right only" back-off matching: like back_off_match_length_r, but the
 * backward direction is compared on tags alone (no word comparison to
 * the left). Forward the scan runs over matching words starting at
 * offset 2 and then continues over matching tags.
 * No explicit bounds checks; the scans stop at the first mismatch.
 */
inline Match back_off_match_length_ro(const std::vector<int> &text,
				  const int &text_index,
				  const std::vector<int> &result,
				  const std::vector<int> &words,
				  const int &words_index,
				  const std::vector<int> &tags) {
  // forward: words first (from offset 2), then keep going on tags
  int fwd = 2;
  while(words[words_index + fwd] == text[text_index + fwd])
    fwd++;
  while(tags[words_index + fwd] == result[text_index + fwd])
    fwd++;

  // backward: tags only
  int bwd = 1;
  for(; tags[words_index - bwd] == result[text_index - bwd]; ++bwd)
    ;

  const int before = bwd - 1;
  const int after = fwd - 1;
  return Match(words_index, before, after, next_score_r(before, after));
}


/* back_off method
 * Second sweep over the text, run after tag(...). Every position whose
 * stored score (length[index]) is below BACK_OFF_THRESHOLD, and whose word
 * is not a known word with !has_many_tags, is matched again, this time
 * comparing tags as well as words (see back_off_match_length_*).
 *
 * Will not do exactly what we want (matching on words and then tags),
 * will instead match on words and tags only on those positions where at 
 * least one word will match (if any such positions exist). Works backwards
 * from the end of the text, though any other order would also work.
 * Note that the loop runs while index > 0, so position 0 is never examined.
 *
 * Same arguments as void tag(...):
 *   text     - the text to tag, words as ints (negative = unknown word)
 *   result   - tags assigned so far; overwritten where the back-off match
 *              votes for a different tag
 *   length   - score of the match behind each assigned tag; updated
 *              together with result
 *   words    - training data words
 *   tags     - training data tags
 *   word_pos - for each word, the training data positions where it occurs
 *
 * Also reads and writes file-level state (bigrams, hapax_pos,
 * hapax_suffix_pos, Best_score, XML, XMS, ...) and prints progress and
 * statistics on std::cerr.
 */
void back_off(const std::vector<int> &text,
	      std::vector<int> &result,
	      std::vector<double> &length,  
	      const std::vector<int> &words,
	      const std::vector<int> &tags,
	      const std::vector<std::vector<int> > &word_pos
	      ) {
#ifdef VERBOSE
  std::cerr << "checking short matches again" << std::endl;
#endif
  // estimated time remaining related stuff
  struct tms time_vec[2];
  int last = TS;
  times(time_vec);

  // statistics stuff
  int checked_words = 0, retagged = 0;
  int ml = 0, ms = 0; // summed match length and number of retagged words

  int index = TS - 1; // start at the last word and work backwards

  std::vector<Match> matches; // set size to contain all matches for the most common word
  matches.reserve(7000);

  // one vote counter per tag (TLS counters), reused for every position
  int *votes = new int[TLS];
  std::vector<int>::iterator hapax_pos_end = hapax_pos.end();
  bool already_done = 0;
  while(index > 0) {
    // print time remaining (estimated)
    if(index < last - WORD_INTERVAL) {
      std::cerr << "word " << index;   
      last = index;
      times(time_vec+1);
      // elapsed CPU time so far, scaled by (words left) / (words done)
      // NOTE(review): the 0.01 factor assumes 100 clock ticks per second -- confirm
      std::cerr << " (CPU)time remaining " 
		<< static_cast<int>(0.01 * (time_vec[1].tms_utime - time_vec[0].tms_utime 
					    + time_vec[1].tms_stime - time_vec[0].tms_stime) 
				    * last / (TS-last)) << " s " << std::endl;
    }

    int current_word = text[index];
    if(length[index] >= BACK_OFF_THRESHOLD ||
       (current_word >= 0 && !has_many_tags[current_word])) { // if word has a good enough matching already, do nothing
      already_done = 1;
    } else {
      checked_words++;
      // start a fresh candidate list; add_match is defined elsewhere and
      // presumably maintains Best_score while filling matches
      matches.clear(); Best_score = -1;
      bool dont_look_left = 0, dont_look_right = 0;
      if(current_word < 0) { // unknown word
	// use the suffix class of the unknown word, or HAPAX if it has none
	current_word = index_of_text_to_suffix_index[index];
	if(current_word < 0)
	  current_word = HAPAX;
      }
      // w1 = (previous word, current word), w2 = (current word, next word);
      // a side is disabled when the neighbour is unknown or out of range
      std::pair<int,int> w1,w2;
      if(index > 0)
	if(text[index-1] >= 0) // known word
	  w1 = std::pair<int,int>(text[index-1],current_word);
	else 
	  dont_look_left = 1;
      else
	dont_look_left = 1;
      if(index < TS - 1)
	if(text[index+1] >= 0) // known word
	  w2 = std::pair<int,int>(current_word, text[index+1]);
	else
	  dont_look_right = 1;
      else
	dont_look_right = 1;
      
      // a side is also disabled when its bigram is not in bigrams
      if(dont_look_left || bigrams.find(w1) ==  bigrams.end())
	dont_look_left = 1;
      if(dont_look_right || bigrams.find(w2) ==  bigrams.end())
	dont_look_right = 1;

      if(dont_look_left && dont_look_right) {
	// no usable bigram on either side: match on tags only (_c variant)
	// iterate over all word positions
	if(current_word >= HAPAX) {
	  if(current_word == HAPAX)
	    for(std::vector<int>::const_iterator it = hapax_pos.begin();
		it != hapax_pos_end;
		++it)
	      add_match(matches, back_off_match_length_c(text, index, result, words, *it, tags));
	  else {
	    // values above HAPAX index into hapax_suffix_pos
	    std::vector<int>::const_iterator end = hapax_suffix_pos[current_word - HAPAX].end();
	    for(std::vector<int>::const_iterator it = hapax_suffix_pos[current_word - HAPAX].begin();
		it != end;
		++it)
	      add_match(matches, back_off_match_length_c(text, index, result, words, *it, tags));
	  }
	} else {
	  // this takes about half the time of back_off, and does some good
	  const std::vector<int> &wp = word_pos[current_word];
	  std::vector<int>::const_iterator wpe = wp.end();
	  for(std::vector<int>::const_iterator it = wp.begin();
	      it != wpe;
	      ++it)
	    add_match(matches, back_off_match_length_c(text, index, result, words, *it, tags));
	}
      } else if(dont_look_left && !dont_look_right) {
	// only the right bigram (w2) exists
	// iterate over the bigram that exists
	const std::vector<int> &wp2 = bigrams[w2];
	std::vector<int>::const_iterator end = wp2.end();
	// known focus word -> _ro variant, unknown focus word -> _r variant
	if(text[index] >= 0)
	  for(std::vector<int>::const_iterator it = wp2.begin();
	      it != end;
	      ++it) 
	    add_match(matches, back_off_match_length_ro(text, index, result, words, *it, tags));
	else
	  for(std::vector<int>::const_iterator it = wp2.begin();
	      it != end;
	      ++it) 
	    add_match(matches, back_off_match_length_r(text, index, result, words, *it, tags));
      }	else if(!dont_look_left && dont_look_right) {
	// only the left bigram (w1) exists
	// iterate over the bigram that exists
	const std::vector<int> &wp1 = bigrams[w1];
	std::vector<int>::const_iterator end = wp1.end();
	// *it + 1: the match functions get the position one past the stored
	// one (presumably the stored position is the bigram's first word)
	if(text[index] >= 0)
	  for(std::vector<int>::const_iterator it = wp1.begin();
	      it != end;
	      ++it)
	    add_match(matches, back_off_match_length_lo(text, index, result, words, *it + 1, tags));
	else
	  for(std::vector<int>::const_iterator it = wp1.begin();
	      it != end;
	      ++it)
	    add_match(matches, back_off_match_length_l(text, index, result, words, *it + 1, tags));
      } else {
	// both bigrams exist, use both
	// this takes about half the time of back_off
	const std::vector<int> &wp1 = bigrams[w1];
	const std::vector<int> &wp2 = bigrams[w2];
	std::vector<int>::const_iterator end = wp1.end();
	if(text[index] >= 0) {
	  for(std::vector<int>::const_iterator it = wp1.begin();
	      it != end;
	      ++it)
	    add_match(matches, back_off_match_length_lo(text, index, result, words, *it + 1, tags));
	  end = wp2.end();
	  for(std::vector<int>::const_iterator it = wp2.begin();
	      it != end;
	      ++it)
	    add_match(matches, back_off_match_length_ro(text, index, result, words, *it, tags));
	} else { 
	  for(std::vector<int>::const_iterator it = wp1.begin();
	      it != end;
	      ++it)
	    add_match(matches, back_off_match_length_l(text, index, result, words, *it + 1, tags));
	  end = wp2.end();
	  for(std::vector<int>::const_iterator it = wp2.begin();
	      it != end;
	      ++it)
	    add_match(matches, back_off_match_length_r(text, index, result, words, *it, tags));
	}
      }
    }
    if(already_done) {
      // nothing to do for this position, just step to the previous word
      index--;
      already_done = 0;
    } else {
      if(matches.size() < 1) {
	std::cerr << "No matches? This should not happen...\n";
	index--;
      } else {
	Match best_match = matches[0];
	int MS = matches.size();
	if(MS > 1) {
	  // several candidates: let them vote with the tag at their focus position
	  memset(votes, 0, TLS*sizeof(int));
	  for(int i = 0; i < MS; i++) {
	    // currently voting only by chosen tag on focus word, seems suboptimal cmp. to old way... but faster
	    votes[tags[matches[i].word_index]]++; 
	  }
	  // pick the tag with the most votes; on a tie the lowest tag index wins
	  int max = 0;
	  int maxi = 0;
 	  for(int i = 0; i < TLS; i++)
 	    if(votes[i] > max) {
 	      max = votes[i];
 	      maxi = i;
 	    }
	  // use the first candidate that carries the winning tag
	  for(int i = 0; i < MS; i++) 
	    if(tags[matches[i].word_index] == maxi) {
	      best_match = matches[i];
	      break;
	    }
	}
	// result, length and the statistics are only touched when the tag changes
	if(result[index] != tags[best_match.word_index]) {
	  retagged++;
	  result[index] = tags[best_match.word_index];
	  ml += best_match.before + best_match.after + 1; ms++;
	  /*
	  if(text[index] >= 0) // current_word is a known word
	    length[index] = best_match.score;
	  else
	    length[index] = 0;
	  */
	  length[index] = best_match.score;
	}
	index--;
      }
    } 
  }
  // mean match length of the retagged words; if nothing was retagged
  // (ms == 0) the first line is a floating point 0/0
  std::cerr << "Back-off: " << ((float) ml) / ms << " (no unambiguous)\n";
  // XML and XMS are file-level counters also updated in the first tagging pass
  std::cerr << "Total: " << ((float) ml+XML) / (ms+XMS) << " (no unambiguous)\n";
#ifdef VERBOSE
  std::cerr << checked_words << " words checked, " << retagged << " words retagged." << std::endl;
#endif
  delete [] votes;
}
#endif


/* Check that a file can be opened for reading.
 * Prints "Could not open file: <path>" on std::cerr and returns false if
 * it cannot; returns true otherwise. The file is closed again right away.
 */
static bool stomp_can_read(const std::string &path) {
  std::fstream fin;
  fin.open(path.c_str(), std::ios::in);
  if( ! fin.is_open() ) {
    std::cerr << "Could not open file: " << path << "\n";
    return false;
  }
  fin.close();
  return true;
}

/* Print one line of the timing table on std::cerr: the label, user time,
 * system time and count divided by (user + system time) as words/s, all
 * for the interval between the two times() samples from and to.
 * NOTE(review): the 0.01 factor assumes 100 clock ticks per second;
 * sysconf(_SC_CLK_TCK) would give the real value -- confirm before relying
 * on these numbers on other systems.
 */
static void stomp_print_timing(const char *label,
			       const struct tms &from,
			       const struct tms &to,
			       double count) {
  std::cerr << label << 0.01 * (to.tms_utime - from.tms_utime)
	    << "\t" << 0.01 * (to.tms_stime - from.tms_stime)
	    << " (" << count / (0.01 * (to.tms_utime - from.tms_utime
					+ to.tms_stime - from.tms_stime))
	    << " words/s)" << std::endl;
}

/* Program entry point.
 * Usage: <program> <corpus directory> <file to be tagged>
 * Loads the tag lexicon and the training corpus from the corpus directory,
 * reads and tags the text (plus the back-off sweep when BACK_OFF is
 * defined), prints the result and finally prints timing statistics on
 * std::cerr.
 * Returns 0 on success and 1 when the arguments are missing or one of the
 * input files cannot be opened.
 */
int main(int argc, char **argv) {
  // CPU time samples: 0 start, 1 corpus loaded, 2 text read, 3 tagged,
  // 4 back-off done, 5 result printed
  struct tms time_vec[6];

  times(time_vec);
  if(argc < 3) {
    std::cerr << "Usage: " << argv[0] << " <corpus directory> <file to be tagged>" << std::endl;
    return 1;
  }

  // fail early, before the (slow) corpus load, if the text cannot be read
  if(!stomp_can_read(argv[2]))
    return 1;

  std::vector<int> text;  // the text that we need to tag, words stored as ints
  std::vector<std::string> original_text;  // the text, stored as strings
  std::vector<int> result;  // the tags we assign
  std::vector<double> length;  // store scores for assigned tags


  std::vector<int> words; // store words in training data
  std::vector<int> tags;  // store tags in training data
  map_string_int tag_lexicon;  // map string to int, translate tags to ints
  std::vector<std::string> tag_lookup; // translate tags back to strings, map int to string

  original_text.reserve(MAX_TEXT_SIZE);
  text.reserve(MAX_TEXT_SIZE);
  map_string_int word_int; // integer representation for strings (for words)

  // load training data
  std::string corpusdir(argv[1]);

  if(!stomp_can_read(dirFile(corpusdir, "taglex")))
    return 1;
  load_taglex(dirFile(corpusdir, "taglex"), tag_lexicon, tag_lookup);
  words.reserve(TRAINING_SIZE); // reserve space for all words, this saves A LOT of time when reading training data
  tags.reserve(TRAINING_SIZE);
  std::vector<std::vector<int> > word_pos(LEXICON_SIZE); // list of indexes where a word occurs
  if(!stomp_can_read(dirFile(corpusdir, "corpus")))
    return 1;
  load_corpus(dirFile(corpusdir, "corpus"), words, tags, word_pos, word_int, tag_lexicon);
  times(time_vec+1);

  // read text
  load_text(argv[2], text, original_text, result, word_int);
  times(time_vec+2);

  // tag text
  tag(text, result, length, words, tags, word_pos);
  times(time_vec+3);

#ifdef BACK_OFF 
  // use back-off strategy for short matches
  back_off(text, result, length, words, tags, word_pos);
#endif
  // sampled in both configurations so that the "Print result" line below
  // always has a start time
  times(time_vec+4);
  
  // print result
  print(original_text, result, length, tag_lookup);
  times(time_vec+5);

  // print some timing statistics on std::cerr 
  std::cerr << "Timing  \tuser time\tsystem time" << std::endl;
  stomp_print_timing("Load corpus:\t", time_vec[0], time_vec[1], WS);
  stomp_print_timing("Read text:\t", time_vec[1], time_vec[2], TS);
  stomp_print_timing("Tag text:\t", time_vec[2], time_vec[3], TS);
#ifdef BACK_OFF 
  stomp_print_timing("Next sweep:\t", time_vec[3], time_vec[4], TS);
#endif
  stomp_print_timing("Print result:\t", time_vec[4], time_vec[5], TS);
  stomp_print_timing("Total:  \t", time_vec[0], time_vec[5], TS);
  stomp_print_timing("Total - load corpus:  \t", time_vec[1], time_vec[5], TS);
#ifdef BACK_OFF
  stomp_print_timing("Tagging + back-off:  \t", time_vec[2], time_vec[4], TS);
#endif
  return 0;
}
