/* Rättstavningsprogram. Version 2.66  2016-07-24
   Copyright (C) 2016
   Viggo Kann och Jonas Sjöbergh
   viggo@csc.kth.se
*/

/******************************************************************************

    This file is part of Stava.

    Stava is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Stava is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Stava.  If not, see <http://www.gnu.org/licenses/>.

******************************************************************************/

/* Some data files are stored in map-like structures. Preferably
 * hash tables. The C++ standard library map-class can be used,
 * but if more efficient data structures are available those might
 * be a better choice.
 *
 * The five typedefs below are the only container names the rest of
 * this file uses, so the container implementation can be switched
 * here without touching any other code:
 *   map_string_int        string -> int
 *   map_string_float      string -> float
 *   map_string_map        string -> (string -> int)
 *   map_string_map_float  string -> (string -> float)
 *   map_string_vector     string -> vector of strings
 */
// Change this to "#if 0" to select the __gnu_cxx::hash_map variant below.
#if 1
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <sstream>

#include <map>
#include <vector>
#include <string>
typedef std::map<std::string, int> map_string_int;
typedef std::map<std::string, float> map_string_float;
typedef std::map<std::string, map_string_int> map_string_map;
typedef std::map<std::string, map_string_float> map_string_map_float;
typedef std::map<std::string, std::vector<std::string> > map_string_vector;
#else
// Alternative: the GNU hash_map extension.
#include <ext/hash_map>
#include <vector>
#include <string>
namespace __gnu_cxx {
  // Hash for std::string keys: forwards to the const char* hash on the
  // string's c_str().
  template<> struct hash< std::string > {
    size_t operator()( const std::string& x ) const {
      return hash< const char* >()( x.c_str() );
    }
  };
}
typedef __gnu_cxx::hash_map<std::string, int, __gnu_cxx::hash<std::string> > map_string_int;
typedef __gnu_cxx::hash_map<std::string, float, __gnu_cxx::hash<std::string> > map_string_float;
typedef __gnu_cxx::hash_map<std::string, map_string_int, __gnu_cxx::hash<std::string> > map_string_map;
typedef __gnu_cxx::hash_map<std::string, map_string_float, __gnu_cxx::hash<std::string> > map_string_map_float;
typedef __gnu_cxx::hash_map<std::string, std::vector<std::string>, __gnu_cxx::hash<std::string> > map_string_vector;
#endif

/*
 * Dictionaries of frequencies used by some methods below. Should be
 * initialized in InitSplitCompoundsData(); until then they are all empty.
 */

// word -> count, read from the head frequency file; freq() looks up every
// component except the last one here.
static map_string_int head_freq;
// word -> count, read from the tail frequency file; freq() looks up the last
// component here.
static map_string_int tail_freq;
// word -> sorted, duplicate-free list of PoS tags (each tag cut at its first
// '.'), read from the frequency/word/tag/lemma file.
static map_string_vector taglex;
// head PoS tag -> tail PoS tag -> value parsed from the tag pair file.
static map_string_map_float tag_pairs;
// Only the keys matter (every value is 0); pospart() divides a suggestion's
// score by 1000 for each component found here.
static map_string_int stop_words; // used in PoS of compound components

#include <algorithm>
#include <cstdlib>
#include <fstream>
extern "C" {
#include "stava.h"
#include "libstava.h"
#include "suffix.h"
}

// Default data file names. The one-argument InitSplitCompoundsData() passes
// these to the five-argument overload, which appends each name directly to
// the library path it was given.
#define HEAD_FREQ_FILENAME "head_frequencies.txt"
#define TAIL_FREQ_FILENAME "tail_frequencies.txt"
#define WTL_FREQ_FILENAME "cwtl"
#define TAG_PAIR_FREQ_FILENAME "pospair_frequencies.txt"

void InitSplitCompoundsData(const char * stava_lib_path,
	  const char * head_freq_filename,
	  const char * tail_freq_filename,
	  const char * wtl_freq_filename,
	  const char * tag_pair_freq_filename
	  ) {
  int libPathLen = strlen(stava_lib_path);
  char head_freq_path[strlen(head_freq_filename)+libPathLen+1];
  char tail_freq_path[strlen(tail_freq_filename)+libPathLen+1];
  char wtl_freq_path[strlen(wtl_freq_filename)+libPathLen+1];
  char tag_pair_freq_path[strlen(tag_pair_freq_filename)+libPathLen+1];
  sprintf(head_freq_path, "%s%s", stava_lib_path, head_freq_filename);
  sprintf(tail_freq_path, "%s%s", stava_lib_path, tail_freq_filename);
  sprintf(wtl_freq_path, "%s%s", stava_lib_path, wtl_freq_filename);
  sprintf(tag_pair_freq_path, "%s%s", stava_lib_path, tag_pair_freq_filename);

  // Head frequency data
  // expects each line to be: <compound head> <space or tab> <frequency count>
  std::ifstream hf(head_freq_path);
  while(hf) {
    std::string line;
    std::getline(hf, line);
    if(line.length() > 1) {
      size_t pos = line.find_first_of(" \t");
      std::string head = line.substr(0,pos);
      size_t pos2 = line.find_first_of(" \t\r\n",pos+1);
      int count = 0;
      if(pos2 != std::string::npos)
	count = atoi(line.substr(pos+1,pos2-pos-1).c_str());
      else
	count = atoi(line.substr(pos+1).c_str());
      head_freq[head] = count;
    }
  }
  // tail frequency data 
  // expects each line to be: <compound tail> <space or tab> <frequency count>
  std::ifstream tf(tail_freq_path);
  while(tf) {
    std::string line;
    std::getline(tf, line);
    if(line.length() > 1) {
      size_t pos = line.find_first_of(" \t");
      std::string tail = line.substr(0,pos);
      size_t pos2 = line.find_first_of(" \t\r\n",pos+1);
      int count = 0;
      if(pos2 != std::string::npos)
	count = atoi(line.substr(pos+1,pos2-pos-1).c_str());
      else
	count = atoi(line.substr(pos+1).c_str());
      tail_freq[tail] = count;
    }
  }
  // file with frequency - word - tag - lemma 
  // expects each line to be: <frequency count> <tab> <word> <tab> <PoS tag> <tab> <lemma>
  std::ifstream cwtl(wtl_freq_path);
  while(cwtl) {
    std::string line;
    std::getline(cwtl, line);
    if(line.length() > 1) {
      std::string word, tag;
      size_t pos1 = line.find("\t");
      size_t pos2 = line.find("\t", pos1 + 1);
      size_t pos3 = line.find("\t", pos2 + 1);
      word = line.substr(pos1+1, pos2-pos1-1);
      tag = line.substr(pos2+1, pos3-pos2-1);
      size_t pos = tag.find(".");
      if(pos != std::string::npos) 
	taglex[word].push_back(tag.substr(0,pos));
      else
	taglex[word].push_back(tag);
    }
  }
  for(map_string_vector::iterator it = taglex.begin();
      it != taglex.end();
      ++it) {
    std::sort(it->second.begin(), it->second.end());
    std::vector<std::string>::iterator new_end = std::unique(it->second.begin(), it->second.end());
    it->second.resize(new_end - it->second.begin());
  }

  // read tag - tag (tag-pair) frequencies
  // expects each line to be: 
  // <frequency count> <space or tab> <PoS tag of head> <space or tab> <PoS tag of tail> <space or tab> <frequency % (of whole file)>
  std::ifstream tag_pairsf(tag_pair_freq_path);
  while(tag_pairsf) {
    std::string line;
    std::getline(tag_pairsf, line);
    if(line.length() > 1) {
      size_t pos = line.find_first_of(" \t");
      size_t pos2 = line.find_first_of(" \t", pos+1);
      size_t pos3 = line.find_first_of(" \t", pos2+1);
      tag_pairs[line.substr(pos+1,pos2-pos-1)][line.substr(pos2+1, pos3-pos2-1)] = atof(line.substr(pos3+1).c_str());
    }
  }

  stop_words["het"] = 0;
  stop_words["hets"] = 0;
  stop_words["fr"] = 0;
  stop_words["ner"] = 0;
  stop_words["ande"] = 0;
  stop_words["da"] = 0;
  stop_words["te"] = 0;
  stop_words["aren"] = 0;
  stop_words["ans"] = 0;
  stop_words["or"] = 0;
  stop_words["ors"] = 0;
  stop_words["var"] = 0;
}


extern "C" {
// Convenience entry point with C linkage: loads the compound-splitting data
// from lib_path using the default file names (the *_FILENAME macros) by
// forwarding to the five-argument overload.
void InitSplitCompoundsData(const char * lib_path) {
  InitSplitCompoundsData(lib_path, HEAD_FREQ_FILENAME, TAIL_FREQ_FILENAME, 
			 WTL_FREQ_FILENAME, TAG_PAIR_FREQ_FILENAME);
}

// Reads the Stava lexicon via StavaReadLexicon(). If that call returns zero
// an error message ("Cannot initialize Stava", in Swedish) is printed on
// stderr and the whole process exits with status 1.
// NOTE(review): the meaning of the numeric flags and of the "," argument is
// defined by StavaReadLexicon, which is not visible here - see its
// declaration.
void InitStava(const char * stava_lib_path) {
  if (!StavaReadLexicon(stava_lib_path,1,1,0,0,0,0,(const unsigned char *)",")) {
    fprintf(stderr, "Kan inte initiera Stava\n");
    exit(1);
  }
}
}

// Transforms an upper case character to lower case for ISO 8859-1 (Latin-1)
// text: ASCII 'A'-'Z' plus the Swedish capitals A-ring (0xC5),
// A-diaeresis (0xC4) and O-diaeresis (0xD6). Any other value is returned
// unchanged.
//
// The Swedish letters are written as hex escapes so that the function does
// not depend on the encoding this source file happens to be saved in. They
// are deliberately plain `char` literals: when the argument was converted
// from a `char` (as std::transform over a std::string does) it is negative
// for these letters on platforms where `char` is signed, and the literals
// convert to int the same way.
static inline int my_tolower(const int i) {
  if(i >= 'A' && i <= 'Z')
    return i - 'A' + 'a';
  switch(i) {
  case '\xC5': return '\xE5'; // A-ring
  case '\xC4': return '\xE4'; // A-diaeresis
  case '\xD6': return '\xF6'; // O-diaeresis
  default:     return i;
  }
}

// Returns a copy of s in which every "|s|" (a linking s standing as its own
// component) has been collapsed to a single "|". Matches are taken from left
// to right and never overlap; all other characters are copied unchanged.
static std::string remove_s(const std::string &s) {
  static const char kLinkingS[] = "|s|";
  const std::string::size_type kLinkingSLen = 3;

  std::string out;
  std::string::size_type from = 0;
  for(;;) {
    const std::string::size_type hit = s.find(kLinkingS, from);
    if(hit == std::string::npos)
      break;
    out.append(s, from, hit - from); // text before the match
    out += '|';                      // the match itself shrinks to one bar
    from = hit + kLinkingSLen;
  }
  out.append(s, from, std::string::npos); // whatever follows the last match
  return out;
}

static void freq(std::vector<std::string> &suggs, 
	  std::vector<float> &freq_scores) {
  float sum = 0;
  for(unsigned int i = 0; i < suggs.size(); i++) {
    std::string s = remove_s(suggs[i]);
    size_t pos = -1;
    size_t pos2 = s.find('|');
    int f = 0, min_freq = 1000000;
    std::vector<int> v;
    while(pos2 != std::string::npos) {
      map_string_int::iterator it = head_freq.find(s.substr(pos+1,pos2-pos-1));
      if(it == head_freq.end())
	f = 0;
      else
	f = it->second;
      if(f < min_freq)
	min_freq = f;
      v.push_back(f);
      pos = pos2;
      pos2 = s.find('|', pos+1);
    }
    map_string_int::iterator it = tail_freq.find(s.substr(pos+1));
    if(it == tail_freq.end())
      f = 0;
    else
      f = it->second;
    if(f < min_freq)
      min_freq = f;
    v.push_back(f);
    
    float temp = 1, poss = 1.0;
    std::sort(v.begin(),v.end());
    for(unsigned int k = 0; k < v.size(); k++) {
      temp += v[k] / poss;
      poss *= 10000;
    }
  
    freq_scores[i] = temp; // originally just v[0]
    sum += freq_scores[i];
  }

  for(unsigned int i = 0; i < suggs.size(); i++) // normalize
    freq_scores[i] /= sum;
}

static void pospart(std::vector<std::string> &suggs, 
	     std::vector<float> &pospart_scores) {
  std::vector<std::string> NOT_KNOWN;
  NOT_KNOWN.push_back("NOT_KNOWN");
  float sum = 0;

  for(unsigned int i = 0; i < suggs.size(); i++) {
    std::string s = remove_s(suggs[i]);
    size_t pos = -1;
    size_t pos2 = s.find('|');
    float score = 1;

    std::vector<std::vector<std::string> *> heads;
    while(pos2 != std::string::npos) {
      std::string h = s.substr(pos+1,pos2-pos-1);
      if(stop_words.find(h) != stop_words.end())
	score /= 1000;
      map_string_vector::iterator it = taglex.find(h);
      if(it != taglex.end())
	heads.push_back(&(it->second));
      else {
	std::string word1 = h.substr(0,h.length()-1),
	  worda = h + "a",
	  word1a = word1 + "a";
	char last = h[h.length() -1];
	
	if((last == 's' 
	      || last == 'a'
	      || last == 'e'
	      || last == 'u'
	      || last == 'o'
	      || last == '-'
	      ) && taglex.find(word1) != taglex.end()) {
	  heads.push_back(&(taglex[word1]));
	} else if(taglex.find(worda) != taglex.end()) {
	  heads.push_back(&(taglex[worda]));
	} else if((last == 's' 
		   || last == 'a'
		   || last == 'e'
		   || last == 'u'
		   || last == 'o'
		   ) && taglex.find(word1a) != taglex.end()) {
	  heads.push_back(&(taglex[word1a]));
	} else {
	  heads.push_back(&NOT_KNOWN);
	}
      }
      pos = pos2;
      pos2 = s.find('|', pos+1);
    }
    std::string t = s.substr(pos+1);
    if(stop_words.find(t) != stop_words.end())
      score /= 1000;
    map_string_vector::iterator it = taglex.find(t);
    std::vector<std::string> *tail;
    if(it == taglex.end())
      tail = &NOT_KNOWN;
    else
      tail = &(it->second);

    for(unsigned int j = 0; j < heads.size(); j++) {
      float best = 0;
      for(unsigned int l = 0; l < heads[j]->size(); l++)
	for(unsigned int k = 0; k < tail->size(); k++) {
	  float curr = tag_pairs[(*heads[j])[l]][(*tail)[k]];
	  if(curr > best)
	    best = curr;
	}      
      score *= best;
    }
    pospart_scores[i] = score; // no need to adjust for no of components, all suggestion have the same number
    sum += pospart_scores[i];
  }
  if(sum > 0) {
    for(unsigned int i = 0; i < suggs.size(); i++) // normalize
      pospart_scores[i] /= sum;
  }
}

/* StavaSplitCompound analyzes a compund and splits it in the optimal way. 
   Before StavaSplitCompound is called the first time StavaReadLexicon/InitStava
   and InitSplitCompoundsData must have been called.
*/
void StavaSplitCompound(
			 char *res, /* result will appear here */
			 const char *cword) /* word to be analyzed */
{
    std::string word(cword);
    std::transform(word.begin(), word.end(),word.begin(), my_tolower);

    // check if it is ok as a non-compound
    // if not, split into all possible interpretations
    unsigned char longres[10000];
    int iscompound = StavaGetAllCompounds(longres, (const unsigned char *) (word.c_str()));

    if(iscompound) {
      map_string_vector::iterator ispm_it = taglex.find(word);
      bool ispm = 0; // is this a proper name? (if so, probably better not to split it)
      if(ispm_it != taglex.end()) {
	std::vector<std::string> &v = ispm_it->second;
	for(unsigned int j = 0; j < v.size(); j++)
	  if(v[j] == "pm")
	    ispm = 1;
      }
      if(!ispm) {
	std::vector<std::string> suggs;
	std::string sugg = (const char *) longres;
	int offset = 0;
	while(sugg != "") {
	  suggs.push_back(sugg);
#if 0
	  std::cout << "==" << sugg << "==\n";
#endif
	  offset += sugg.length() + 1;
	  sugg = (const char *) (longres + offset);
	}

	// discard all interpretations with many components
	std::vector<int> parts(suggs.size(),0);
	int minparts = suggs[0].length();
	for(unsigned int k = 0; k < suggs.size(); k++) {
	  for(unsigned int j = 0; j < suggs[k].length(); j++)
	    if(suggs[k][j] == '|' 
	       && (j < 2 
		   || !(suggs[k][j-1] == 's' && suggs[k][j-2] == '|')))
	      parts[k]++;
	  if(parts[k] < minparts)
	    minparts = parts[k];
	}
      
	int k = 0;
	for(unsigned int j = 0; j < suggs.size(); j++)
	  if(parts[j] == minparts) {
	    suggs[k] = suggs[j];
	    parts[k] = parts[j];
	    k++;
	  }
	suggs.resize(k);
	parts.resize(k);
      
	if(suggs.size() > 1) {

	  // run some methods and get the score of each suggestion, pick the one with the best overall score
	  std::vector<float> freq_scores(k,1);
	  freq(suggs, freq_scores);

	  std::vector<float> pospart_scores(k,1);
	  pospart(suggs, pospart_scores);

	  float best = 5*freq_scores[0] + 2*pospart_scores[0];
	  int best_index = 0;
	  for(int j = 1; j < k; j++) {
	    float score = 5*freq_scores[j] + 2*pospart_scores[j];
	    if(score > best) {
	      best = score;
	      best_index = j;
	    }
	  }

	  // handle the |s| (my manual annotation file prefers this if possible)
	  // change here if ...s|... is preferable over ...|s|...
	  // as in "jord|bruk(|)s|politik"
	  //                 ^^^
	  for(int j = 0; j < k; j++) {
	    size_t pos = suggs[j].find("|s|");
	    if(j != best_index 
	       && pos != std::string::npos
	       && suggs[j].substr(0,pos)+suggs[j].substr(pos+1) == suggs[best_index])
	      {
		best_index = j;
		break;
	      }
	     
	  }

	  strcpy(res, suggs[best_index].c_str());
	}
	else
	{ // only one suggestion
	  strcpy(res, suggs[0].c_str());
	}
      } else { // pm tag
	*res = '\0'; // = words[i];
      }
    } else { // not compound
      *res = '\0'; // = words[i];
    }
}

extern "C" {
// C-linkage wrapper around StavaSplitCompound() for callers that work with
// unsigned char strings: both pointers are cast to plain char and passed
// straight through, so the same requirements on res and cword apply.
void StavaAnalyzeCompound(
			 unsigned char *res, /* result will appear here */
			 const unsigned char *cword) /* word to be analyzed */
{
  StavaSplitCompound((char *) res, (const char *) cword);
}
}
