/* 
 * API for splitting compounds, originally for Swedish, but
 * many methods should be more or less language independent.
 *
 * Requires Stava, with one modification to the standard Stava API.
 * Also requires some language specific data files.
 *
 *
 * Author of this file: Jonas Sjöbergh
 */


/* 
 * See also the corresponding cpp-file, there are some 
 * data type choices that can be made there.
 */

/*
 * STL stuff that is used.
 */

#include <vector>
#include <string>

/*
 * init -- one-time setup.
 *
 * Must be called before any other function declared in this file is
 * called. Reads the data files and initializes things.
 *
 * Parameters:
 *   stava_lib_path  directory where Stava's data files are
 *   head_freq       frequency counts of compound heads
 *   tail_freq       frequency counts of compound tails
 *   ngram_freq      frequency counts of character n-grams in
 *                   non-compound words
 *   wtl_freq        frequency counts of word-tag-lemma pairs
 *   tag_pair_freq   frequency counts of head PoS and tail PoS pairs
 *
 * NOTE(review): the *_freq arguments are presumably paths to the
 * corresponding data files -- confirm against the cpp-file.
 */
void init(const char *stava_lib_path,
          const char *head_freq,
          const char *tail_freq,
          const char *ngram_freq,
          const char *wtl_freq,
          const char *tag_pair_freq);

/*
 * split_compounds -- split the compounds in a text.
 *
 * Uses a combination of several of the ranking methods declared below.
 * Other combinations might prove better depending on the available data
 * files or the domain. Switching to a different combination is
 * relatively straightforward, though it requires changing the source
 * code.
 *
 * Parameters:
 *   words   all words in the text
 *   result  results: non-compounds have "" here, compounds have the
 *           word with '|' added, for instance "glass|skål"
 */
void split_compounds(std::vector<std::string> &words,
                     std::vector<std::string> &result);

/* Ranking functions
 *
 * All functions take a vector of strings and a vector of
 * floats. The vector of floats is used to return the results.
 */

/*
 * freq -- rank suggestions according to the mean frequency of all
 * components.
 *
 * Higher scores indicate better suggestions.
 *
 * Parameters:
 *   suggs        suggestions
 *   freq_scores  results
 */
void freq(std::vector<std::string> &suggs,
          std::vector<float> &freq_scores);

/*
 * context -- rank suggestions using the words in the context.
 *
 * Suggestions are ranked according to the frequencies of their
 * components in the current context. Higher scores are better.
 *
 * Parameters:
 *   suggs           suggestions
 *   context_scores  results
 *   words           all words
 *   index           index of the current compound in the words vector
 */
void context(std::vector<std::string> &suggs,
             std::vector<float> &context_scores,
             std::vector<std::string> &words,
             int index);

/*
 * ngram -- rank suggestions using n-grams of characters.
 *
 * Suggestions are ranked according to the frequencies of the character
 * n-grams occurring over the suggested component borders. The n-gram
 * frequencies are collected from non-compound words, i.e. the score is
 * similar to the likelihood of seeing these characters internally in a
 * non-compound word.
 *
 * NOTE(review): lower scores are presumably better -- the original
 * comment left this as an open question; confirm against the cpp-file.
 *
 * Parameters:
 *   suggs         suggestions
 *   ngram_scores  results
 */
void ngram(std::vector<std::string> &suggs,
           std::vector<float> &ngram_scores);

/*
 * pospart -- rank suggestions using the part-of-speech (PoS) of the
 * compound components.
 *
 * Suggestions are ranked according to how common the combination of
 * the suggested word classes is, e.g. noun-noun compounds are very
 * common. Higher scores are better.
 *
 * Parameters:
 *   suggs           suggestions
 *   pospart_scores  results
 */
void pospart(std::vector<std::string> &suggs,
             std::vector<float> &pospart_scores);

/*
 * remove_s -- turn "fotboll|s|lag" into "fotboll|lag".
 *
 * The s is not really a compound component and can screw up some
 * ranking methods. Used by those methods that need it.
 *
 * Parameters:
 *   s  the string to process, e.g. "fotboll|s|lag"
 */
std::string remove_s(const std::string &s);
