/* Some data files are stored in map-like structures. Preferably
 * hash tables. The C++ standard library map-class can be used,
 * but if more efficient data structures are available those might
 * be a better choice.
 */
#if 1
// Active configuration: containers are the ordered std::map.
#include <map>
#include <vector>
#include <string>
// string -> int
typedef std::map<std::string, int> map_string_int;
// string -> float
typedef std::map<std::string, float> map_string_float;
// string -> (string -> int)
typedef std::map<std::string, map_string_int> map_string_map;
// string -> (string -> float)
typedef std::map<std::string, map_string_float> map_string_map_float;
// string -> list of strings
typedef std::map<std::string, std::vector<std::string> > map_string_vector;
#else
// Disabled alternative: the same five typedef names backed by the GNU
// hash_map extension. Switch the "#if 1" above to "#if 0" to select it.
#include <ext/hash_map>
#include <vector>
#include <string>
namespace __gnu_cxx {
  // Hash functor for std::string keys; it delegates to the
  // hash<const char*> functor applied to the string's c_str().
  template<> struct hash< std::string > {
    size_t operator()( const std::string& x ) const {
      return hash< const char* >()( x.c_str() );
    }
  };
}
typedef __gnu_cxx::hash_map<std::string, int, __gnu_cxx::hash<std::string> > map_string_int;
typedef __gnu_cxx::hash_map<std::string, float, __gnu_cxx::hash<std::string> > map_string_float;
typedef __gnu_cxx::hash_map<std::string, map_string_int, __gnu_cxx::hash<std::string> > map_string_map;
typedef __gnu_cxx::hash_map<std::string, map_string_float, __gnu_cxx::hash<std::string> > map_string_map_float;
typedef __gnu_cxx::hash_map<std::string, std::vector<std::string>, __gnu_cxx::hash<std::string> > map_string_vector;
#endif

/*
 * Dictionaries of frequencies used by some methods below. Should be 
 * initialized in init()
 */

// compound head -> frequency count; filled by init() from head_freq_filename
static map_string_int head_freq;
// compound tail -> frequency count; filled by init() from tail_freq_filename
static map_string_int tail_freq;
// character n-gram -> frequency count; filled by init() from ngram_freq_filename
static map_string_int ngram_freq;
// word -> its PoS tags (each tag cut at the first '.'), sorted and
// de-duplicated by init() after reading wtl_freq_filename
static map_string_vector taglex;
// head PoS tag -> tail PoS tag -> value parsed from the last column of
// tag_pair_freq_filename in init()
static map_string_map_float tag_pairs;
// Only the keys matter (init() stores 0 for every entry); pospart() divides
// a suggestion's score by 1000 for each component found here.
static map_string_int stop_words; // used in PoS of compound components

#include <algorithm>
#include <cstdlib>
#include <fstream>
extern "C" {
#include "stava.h"
#include "stavaapi.h"
#include "suffix.h"
}

void init(const char * stava_lib_path,
	  const char * head_freq_filename,
	  const char * tail_freq_filename,
	  const char * ngram_freq_filename,
	  const char * wtl_freq_filename,
	  const char * tag_pair_freq_filename
	  ) {

  if (!StavaReadLexicon(stava_lib_path,1,1,0,0,0,0,(const unsigned char *)",")) {
    fprintf(stderr, "Kan inte initiera Stava\n");
    exit(1);
  }

  // Head frequency data
  // expects each line to be: <compound head> <space or tab> <frequency count>
  std::ifstream hf(head_freq_filename);
  while(hf) {
    std::string line;
    std::getline(hf, line);
    if(line.length() > 1) {
      int pos = line.find_first_of(" \t");
      std::string head = line.substr(0,pos);
      int pos2 = line.find_first_of(" \t\r\n",pos+1);
      int count = 0;
      if(pos2 != std::string::npos)
	count = atoi(line.substr(pos+1,pos2-pos-1).c_str());
      else
	count = atoi(line.substr(pos+1).c_str());
      head_freq[head] = count;
    }
  }
  // tail frequency data 
  // expects each line to be: <compound tail> <space or tab> <frequency count>
  std::ifstream tf(tail_freq_filename);
  while(tf) {
    std::string line;
    std::getline(tf, line);
    if(line.length() > 1) {
      int pos = line.find_first_of(" \t");
      std::string tail = line.substr(0,pos);
      int pos2 = line.find_first_of(" \t\r\n",pos+1);
      int count = 0;
      if(pos2 != std::string::npos)
	count = atoi(line.substr(pos+1,pos2-pos-1).c_str());
      else
	count = atoi(line.substr(pos+1).c_str());
      tail_freq[tail] = count;
    }
  }
  // ngram frequency data 
  // expects each line to be: <character ngram> <space or tab> <frequency count>
  std::ifstream nf(ngram_freq_filename);
  while(nf) {
    std::string line;
    std::getline(nf, line);
    if(line.length() > 1) {
      int pos = line.find_first_of(" \t");
      std::string ngram = line.substr(0,pos);
      int pos2 = line.find_first_of(" \t\r\n",pos+1);
      int count = 0;
      if(pos2 != std::string::npos)
	count = atoi(line.substr(pos+1,pos2-pos-1).c_str());
      else
	count = atoi(line.substr(pos+1).c_str());
      ngram_freq[ngram] = count;
    }
  }
  // file with frequency - word - tag - lemma 
  // expects each line to be: <frequency count> <tab> <word> <tab> <PoS tag> <tab> <lemma>
  std::ifstream cwtl(wtl_freq_filename);
  while(cwtl) {
    std::string line;
    std::getline(cwtl, line);
    if(line.length() > 1) {
      std::string word, tag;
      int pos1 = line.find("\t");
      int pos2 = line.find("\t", pos1 + 1);
      int pos3 = line.find("\t", pos2 + 1);
      word = line.substr(pos1+1, pos2-pos1-1);
      tag = line.substr(pos2+1, pos3-pos2-1);
      int pos = tag.find(".");
      if(pos != std::string::npos) 
	taglex[word].push_back(tag.substr(0,pos));
      else
	taglex[word].push_back(tag);
    }
  }
  for(map_string_vector::iterator it = taglex.begin();
      it != taglex.end();
      ++it) {
    std::sort(it->second.begin(), it->second.end());
    std::vector<std::string>::iterator new_end = std::unique(it->second.begin(), it->second.end());
    it->second.resize(new_end - it->second.begin());
  }

  // read tag - tag (tag-pair) frequencies
  // expects each line to be: 
  // <frequency count> <space or tab> <PoS tag of head> <space or tab> <PoS tag of tail> <space or tab> <frequency % (of whole file)>
  std::ifstream tag_pairsf(tag_pair_freq_filename);
  while(tag_pairsf) {
    std::string line;
    std::getline(tag_pairsf, line);
    if(line.length() > 1) {
      int pos = line.find_first_of(" \t");
      int pos2 = line.find_first_of(" \t", pos+1);
      int pos3 = line.find_first_of(" \t", pos2+1);
      tag_pairs[line.substr(pos+1,pos2-pos-1)][line.substr(pos2+1, pos3-pos2-1)] = atof(line.substr(pos3+1).c_str());
    }
  }

  stop_words["het"] = 0;
  stop_words["hets"] = 0;
  stop_words["fr"] = 0;
  stop_words["ner"] = 0;
  stop_words["ande"] = 0;
  stop_words["da"] = 0;
  stop_words["te"] = 0;
  stop_words["aren"] = 0;
  stop_words["ans"] = 0;
  stop_words["or"] = 0;
  stop_words["ors"] = 0;
  stop_words["var"] = 0;
}

// Transform an upper case character to lower case.
//
// Handles ASCII 'A'-'Z' and the three extra Swedish capitals; every other
// value is returned unchanged. The Swedish letters are written as explicit
// byte escapes so that this file stays pure ASCII and cannot be damaged by
// re-encoding again.
// NOTE(review): assumes the text is ISO-8859-1 (Latin-1), one byte per
// letter - confirm against the encoding of the Stava lexicon and data files.
inline int my_tolower(const int i) {
  if(i >= 'A' && i <= 'Z')
    return i - 'A' + 'a';
  switch(i) {
  case '\xC5':      // A with ring above
    return '\xE5';
  case '\xC4':      // A with diaeresis
    return '\xE4';
  case '\xD6':      // O with diaeresis
    return '\xF6';
  default:
    return i;
  }
}

// Returns a copy of s in which every "|s|" (a linking s standing as its own
// component) is collapsed to a single "|". Scanning resumes after the
// collapsed sequence, so its closing '|' never starts another match.
std::string remove_s(const std::string &s) {
  std::string out;
  const std::string::size_type n = s.length();

  std::string::size_type at = 0;
  while(at < n) {
    const bool linking_s = s.compare(at, 3, "|s|") == 0;
    out += s[at];            // an ordinary character, or the '|' that is kept
    at += linking_s ? 3 : 1; // skip the "s|" that followed the kept '|'
  }
  return out;
}

void freq(std::vector<std::string> &suggs, 
	  std::vector<float> &freq_scores) {
  float sum = 0;
  for(int i = 0; i < suggs.size(); i++) {
    std::string s = remove_s(suggs[i]);
    int pos = -1;
    int pos2 = s.find('|');
    int f = 0, min_freq = 1000000;
    std::vector<int> v;
    while(pos2 != std::string::npos) {
      map_string_int::iterator it = head_freq.find(s.substr(pos+1,pos2-pos-1));
      if(it == head_freq.end())
	f = 0;
      else
	f = it->second;
      if(f < min_freq)
	min_freq = f;
      v.push_back(f);
      pos = pos2;
      pos2 = s.find('|', pos+1);
    }
    map_string_int::iterator it = tail_freq.find(s.substr(pos+1));
    if(it == tail_freq.end())
      f = 0;
    else
      f = it->second;
    if(f < min_freq)
      min_freq = f;
    v.push_back(f);
    
    float temp = 1, poss = 1.0;
    std::sort(v.begin(),v.end());
    for(int k = 0; k < v.size(); k++) {
      temp += v[k] / poss;
      poss *= 10000;
    }
  
    freq_scores[i] = temp; // originally just v[0]
    sum += freq_scores[i];
  }

  for(int i = 0; i < suggs.size(); i++) // normalize
    freq_scores[i] /= sum;
}

// Scores each suggestion by how often its components occur as words near
// words[index].
//
// Every word at distance d (1 <= d < W) on either side of words[index] adds
// 1/d to that word's weight. The word at `index` itself is not part of its
// own context: the loop starts at distance 1, because starting at 0 would
// evaluate 1.0/0. A fixed list of short strings that are usually wrong as
// components gets weight 0. The score of a suggestion is 1/W plus the
// weights of all its components after remove_s(); the scores are then
// divided by their sum. context_scores must already hold at least
// suggs.size() elements.
void context(std::vector<std::string> &suggs, 
             std::vector<float> &context_scores, 
             std::vector<std::string> &words, 
             int index) {
  const int W = 50; // window: distances 1 .. W-1 in each direction are checked
  const int word_count = words.size();

  map_string_float context_words;

  for(int dist = 1; dist < W; dist++) {
    const double weight = 1.0/dist;
    const int right = index + dist;
    const int left = index - dist;
    if(right >= 0 && right < word_count)
      context_words[words[right]] += weight;
    if(left >= 0 && left < word_count)
      context_words[words[left]] += weight;
  }

  /* these are generally not correct, avoid */
  // NOTE(review): same list as stop_words in init(); "fr" looks like a word
  // that lost a non-ASCII letter when the file was re-encoded - confirm.
  static const char * const avoided[] = {
    "het", "hets", "fr", "ner", "ande", "da",
    "te", "aren", "ans", "or", "ors", "var"
  };
  for(unsigned int n = 0; n < sizeof(avoided)/sizeof(avoided[0]); n++)
    context_words[avoided[n]] = 0;

  float sum = 0;
  for(std::vector<std::string>::size_type i = 0; i < suggs.size(); i++) {
    const std::string s = remove_s(suggs[i]);
    float f = 1.0/W; // we want to avoid 0 when dividing later

    std::string::size_type begin = 0;
    for(std::string::size_type bar = s.find('|');
        bar != std::string::npos;
        bar = s.find('|', begin)) {
      f += context_words[s.substr(begin, bar - begin)];
      begin = bar + 1;
    }
    f += context_words[s.substr(begin)];

    context_scores[i] = f;
    sum += f;
  }

  for(std::vector<std::string>::size_type i = 0; i < suggs.size(); i++) // normalize
    context_scores[i] /= sum;
}

// Scores each suggestion by the frequencies of the character n-grams that
// span its component borders.
//
// For every '|' in remove_s(suggs[i]), N-1 windows of N letters are built,
// starting N-1, N-2, ... letters before the border; '|' characters inside a
// window are skipped. A letter that would be the third identical letter in a
// row is dropped and counted in `tripples` (a compound written with a
// collapsed triple consonant). The raw score is 1 plus the ngram_freq counts
// of all windows.
//
// If the dropped-letter count is not a multiple of the number of
// suggestions, the suggestions differ in how they treat such a collapse
// (ss|s vs. s|s) and the n-gram evidence is not comparable: every suggestion
// then gets the same score 1/size, i.e. this method does not vote. This used
// to be written "/= 1.0/size", which multiplied each raw score by the number
// of suggestions instead of neutralising it. Otherwise the raw scores are
// divided by their sum.
//
// ngram_scores must already hold at least suggs.size() elements. An empty
// suggestion list is a no-op (it used to be a modulo by zero).
void ngram(std::vector<std::string> &suggs, 
           std::vector<float> &ngram_scores) {
  const int N = 4; // ngram length
  if(suggs.empty())
    return;

  float sum = 0;
  int tripples = 0;
  for(std::vector<std::string>::size_type i = 0; i < suggs.size(); i++) {
    const std::string s = remove_s(suggs[i]);
    const int len = s.length();
    int f = 1; // we want to avoid 0 in division later

    for(std::string::size_type pos = s.find('|');
        pos != std::string::npos;
        pos = s.find('|', pos + 1)) {
      for(int j = 0; j < N-1; j++) {
        const int start = static_cast<int>(pos) - N + 1 + j;
        std::string ng = "";
        // A window that would begin before the start of the string stays
        // empty (this is what the previous unsigned comparison did too).
        for(int at = start;
            at >= 0 && at < len && static_cast<int>(ng.length()) < N;
            at++) {
          const char c = s[at];
          if(c == '|') // this only works for '|' after the '|' we check now...
            continue;
          const std::string::size_type have = ng.length();
          if(have > 1 && ng[have-1] == c && ng[have-2] == c)
            tripples++; // skip tripple consonant
          else
            ng += c;
        }
        map_string_int::iterator it = ngram_freq.find(ng);
        if(it != ngram_freq.end())
          f += it->second;
      }
    }

    ngram_scores[i] = f;
    sum += ngram_scores[i];
  }

  if(tripples % suggs.size()) {
    for(std::vector<std::string>::size_type i = 0; i < suggs.size(); i++) // don't vote if there are ss|s vs. s|s or similar
      ngram_scores[i] = 1.0/suggs.size();
  } else {
    for(std::vector<std::string>::size_type i = 0; i < suggs.size(); i++) // normalize
      ngram_scores[i] /= sum;
  }
}

void pospart(std::vector<std::string> &suggs, 
	     std::vector<float> &pospart_scores) {
  std::vector<std::string> NOT_KNOWN;
  NOT_KNOWN.push_back("NOT_KNOWN");
  float sum = 0;

  for(int i = 0; i < suggs.size(); i++) {
    std::string s = remove_s(suggs[i]);
    int pos = -1;
    int pos2 = s.find('|');
    float score = 1;

    std::vector<std::vector<std::string> *> heads;
    while(pos2 != std::string::npos) {
      std::string h = s.substr(pos+1,pos2-pos-1);
      if(stop_words.find(h) != stop_words.end())
	score /= 1000;
      map_string_vector::iterator it = taglex.find(h);
      if(it != taglex.end())
	heads.push_back(&(it->second));
      else {
	std::string word1 = h.substr(0,h.length()-1),
	  worda = h + "a",
	  word1a = word1 + "a";
	char last = h[h.length() -1];
	
	if((last == 's' 
	      || last == 'a'
	      || last == 'e'
	      || last == 'u'
	      || last == 'o'
	      || last == '-'
	      ) && taglex.find(word1) != taglex.end()) {
	  heads.push_back(&(taglex[word1]));
	} else if(taglex.find(worda) != taglex.end()) {
	  heads.push_back(&(taglex[worda]));
	} else if((last == 's' 
		   || last == 'a'
		   || last == 'e'
		   || last == 'u'
		   || last == 'o'
		   ) && taglex.find(word1a) != taglex.end()) {
	  heads.push_back(&(taglex[word1a]));
	} else {
	  heads.push_back(&NOT_KNOWN);
	}
      }
      pos = pos2;
      pos2 = s.find('|', pos+1);
    }
    std::string t = s.substr(pos+1);
    if(stop_words.find(t) != stop_words.end())
      score /= 1000;
    map_string_vector::iterator it = taglex.find(t);
    std::vector<std::string> *tail;
    if(it == taglex.end())
      tail = &NOT_KNOWN;
    else
      tail = &(it->second);

    for(int j = 0; j < heads.size(); j++) {
      float best = 0;
      for(int l = 0; l < heads[j]->size(); l++)
	for(int k = 0; k < tail->size(); k++) {
	  float curr = tag_pairs[(*heads[j])[l]][(*tail)[k]];
	  if(curr > best)
	    best = curr;
	}      
      score *= best;
    }
    pospart_scores[i] = score; // no need to adjust for no of components, all suggestion have the same number
    sum += pospart_scores[i];
  }
  for(int i = 0; i < suggs.size(); i++) // normalize
    pospart_scores[i] /= sum;
}


  // combine the method of Stava (throw away all many-part compounds)
  // n-grams (most unlikely n-gram in non-compound)
  // words in context (compound components occurring in context, RI helps but is not available?)
  // frequencies of components (highest min freq or highest mean freq?)

void split_compounds(std::vector<std::string> &words, 
		     std::vector<std::string> &result) {
  int SIZE = words.size();
  result.resize(SIZE);

  for(int i = 0; i < SIZE; i++) {
    std::string word = words[i];
    std::transform(word.begin(), word.end(),word.begin(), my_tolower);

    // check if it is ok as a non-compound
    // if not, split into all possible interpretations
    unsigned char res[10000];
    int iscompound = StavaGetAllCompounds(res, (const unsigned char *) (word.c_str()));

    if(iscompound) {
      map_string_vector::iterator ispm_it = taglex.find(word);
      bool ispm = 0; // is this a proper name? (if so, probably better not to split it)
      if(ispm_it != taglex.end()) {
	std::vector<std::string> &v = ispm_it->second;
	for(int j = 0; j < v.size(); j++)
	  if(v[j] == "pm")
	    ispm = 1;
      }
      if(!ispm) {
	std::vector<std::string> suggs;
	std::string sugg = (const char *) res;
	int offset = 0;
	while(sugg != "") {
	  suggs.push_back(sugg);
	  offset += sugg.length() + 1;
	  sugg = (const char *) (res + offset);
	}

	// discard all interpretations with many components
	std::vector<int> parts(suggs.size(),0);
	int minparts = suggs[0].length();
	for(int k = 0; k < suggs.size(); k++) {
	  for(int j = 0; j < suggs[k].length(); j++)
	    if(suggs[k][j] == '|' 
	       && (j < 2 
		   || !(suggs[k][j-1] == 's' && suggs[k][j-2] == '|')))
	      parts[k]++;
	  if(parts[k] < minparts)
	    minparts = parts[k];
	}
      
	int k = 0;
	for(int j = 0; j < suggs.size(); j++)
	  if(parts[j] == minparts) {
	    suggs[k] = suggs[j];
	    parts[k] = parts[j];
	    k++;
	  }
	suggs.resize(k);
	parts.resize(k);
      
	if(suggs.size() > 1) {

#if 0
	  // run some methods and get the score of each suggestion, pick the one with the best overall score
	  std::vector<float> freq_scores(k,1);
	  freq(suggs, freq_scores);

	  std::vector<float> pospart_scores(k,1);
	  pospart(suggs, pospart_scores);

	  float best = 5*freq_scores[0] + 2*pospart_scores[0];
	  int best_index = 0;
	  for(int j = 1; j < k; j++) {
	    float score = 5*freq_scores[j] + 2*pospart_scores[j];
	    if(score > best) {
	      best = score;
	      best_index = j;
	    }
	  }
#else
	  // run all methods and get the score of each suggestion, pick the one with the best overall score
	  std::vector<float> ngram_scores(k,1);
	  ngram(suggs, ngram_scores); // this one has problems with remiss|svar

	  std::vector<float> freq_scores(k,1);
	  freq(suggs, freq_scores);

	  std::vector<float> pospart_scores(k,1);
	  pospart(suggs, pospart_scores);

	  std::vector<float> context_scores(k,1);
	  context(suggs, context_scores, words, i);

	  float best = context_scores[0] + 5*freq_scores[0] - ngram_scores[0] + 2*pospart_scores[0];
	  int best_index = 0;
	  for(int j = 1; j < k; j++) {
	    float score = context_scores[j] + 5*freq_scores[j] - ngram_scores[j] + 2*pospart_scores[j];
	    if(score > best) {
	      best = score;
	      best_index = j;
	    }
	  }
#endif
	  // handle the |s| (my manual annotation file prefers this if possible)
	  // change here if ...s|... is preferable over ...|s|...
	  // as in "jord|bruk(|)s|politik"
	  //                 ^^^
	  for(int j = 0; j < k; j++) {
	    int pos = suggs[j].find("|s|");
	    if(j != best_index 
	       && pos != std::string::npos
	       && suggs[j].substr(0,pos)+suggs[j].substr(pos+1) == suggs[best_index])
	      {
		best_index = j;
		break;
	      }
	     
	  }

	  result[i] = suggs[best_index];
	} else { // only one suggestion
	  result[i] = suggs[0];
	}
      } else { // pm tag
	result[i] = ""; // = words[i];
      }
    } else { // not compound
      result[i] = ""; // = words[i];
    }
  }
}

#if 0

// This is a modification of Stava that the split compound stuff needs.
// This code should be placed in the Stava files.

/* StavaGetAllCompounds analyzes a compound.
   Before it is called the first time StavaReadLexicon
   must have been called.

   Find all possible compound interpretations of a word,
   unless the word is found as a non-compound in the dictionary.
*/
/* Returns 1 when at least one interpretation was written to res, 0 otherwise.
   NOTE(review): on the "return 0" paths after the strcpy below, res holds a
   copy of word rather than an empty result, so callers must test the return
   value before reading res. */
int StavaGetAllCompounds(
			 unsigned char *res, /* result will appear here, and it will be many '\0'-terminated strings, ended with two consecutive '\0' */
			 const unsigned char *word) /* word to be analyzed */
{ unsigned char buf[LANGD + 3], ord[LANGD + 3], Ord[LANGD + 3];
  unsigned char breaks[LANGD + 3];
  int i, len, bindestreck = 0; /* bindestreck: set when the word contains a '-' */
  unsigned char * start;

  /* start with an empty result list */
  res[0] = 0;
  res[1] = 0;

  /* presumably a lexicon lookup: the word is accepted as it stands and is
     not analyzed further - confirm against the Stava sources */
  if(InILorELbutnotUL(word, strlen(word)))
    return 0;

  strcpy(res, word);
  /* translate the word through bokstavsTabell into buf, stopping at the
     first character that maps to 0; breaks gets one ' ' per character */
  for (i = 0; i < LANGD; i++) {
    breaks[i] = ' ';
    if (!(buf[i] = bokstavsTabell[word[i]])) break;
    if (buf[i] == '-') bindestreck = 1;
  }
  breaks[i] = '\0';
  if (i == LANGD) return 0; /* too long word */
  if (i < ORDMIN) return 0; /* short words are always accepted */
  len = i;
  if (InILorELbutnotUL(buf, len) != 0) return 0;
  if (xAndelser && CheckSuffix(buf, 0)) return 0;
  /* presumably IsCompound() records every interpretation it finds in
     breakpossibilities[] and counts them in noofcompounds while
     compoundSearch is set - confirm against the Stava sources */
  noofcompounds = 0;
  compoundSearch = 1;
  IsCompound(buf, breaks, len);
  if (noofcompounds == 0) {
    /* nothing found: retry with the variants produced by VersalerGemena()
       (presumably other upper/lower case forms of the word) */
    VersalerGemena(buf,ord,Ord);
    if (*ord) IsCompound(ord, breaks, len);
    if (*Ord && noofcompounds == 0) IsCompound(Ord, breaks, len);
  }
  if (bindestreck && noofcompounds == 0 ) {
    /* hyphenated word without an interpretation: analyze each piece between
       hyphens on its own and merge the first break pattern found for each
       piece into totbreaks */
    unsigned char *ordin = buf, *t, totbreaks[LANGD + 3], *b;
    int breakstartpos, ordlen;
    for (i = 0; i < LANGD; i++) totbreaks[i] = ' ';
    totbreaks[i] = '\0';
    do {
      /* copy the next piece (up to '-' or end of word) into ord */
      t = ord; b = breaks;
      breakstartpos = ordin - buf;
      while (*ordin && *ordin != '-') {
	*t++ = *ordin++;
	*b++ = ' ';
      }
      *t = *b = '\0';
      ordlen = strlen(ord);
      /* same acceptance tests as for the whole word above */
      if (ordlen >= ORDMIN &&
	  !InILorELbutnotUL(ord, ordlen) &&
	  (!xAndelser || !CheckSuffix(ord, 0))) {
	noofcompounds = 0;
	IsCompound(ord, breaks, ordlen);
	if (noofcompounds > 0) {
	  breakpossibilities[0][ordlen] = '\0';
	  strcpy(totbreaks + breakstartpos, breakpossibilities[0]);
	  /* the strcpy put a '\0' at the hyphen position; restore the ' ' */
	  if (*ordin == '-') totbreaks[ordin - buf] = ' ';
	}
      }
    } while (*ordin++);
    /* a hyphenated word always yields exactly one interpretation, even when
       no piece could be split */
    noofcompounds = 1;
    totbreaks[len] = '\0';
    strcpy(breakpossibilities[0], totbreaks);
  }
  compoundSearch = 0;
  if (noofcompounds == 0) return 0;


  /* write the interpretations one after another, each '\0'-terminated, and
     keep a second '\0' after the most recent one as the end marker */
  start = res;
  for(i = 0; i < noofcompounds; i++) {
    StringWriteCompound(start, word, breakpossibilities[i]);
    len = strlen(start);
    start[len] = 0;
    start[len+1] = 0;
    start = start + len + 1;
  }
  return 1;
}
#endif
