#!/usr/bin/sh

# Creates the lexicon directories (words/, tags/ and morfs/) in the
# current working directory from the corpus file given as argument.

# files that need to be changed when using a different tagset:
#  taginfo (lists the tags, '+' means open wordclass, '-' means closed)
#  features (doesn't really need to be changed, but you will get a lot of
#            warnings otherwise)
#  inflection.rules
#  critical_words.wtl (contains words that must occur in the corpus, so the
#                      tagger can find its special tags)

# Usage: <this script> <corpus-file>
#
# Builds the lexicon statistics files from a tab-separated corpus and
# writes them to ./words, ./tags and ./morfs (created if missing).
# Helper programs (wt2tt, wt2ttt, freqstatmorf) are run from the parent
# of the script's directory; static data files are copied from the
# "data" directory next to the script.
# Exit status: 0 on success, non-zero on usage or setup errors.

# Print the absolute directory of the script path given as $1.
# An absolute path is used as is; a relative one is resolved against
# the current working directory.
createlex_script_dir() {
  case "$1" in
    /*) dirname "$1" ;;
    *)  dirname "$(pwd)/$1" ;;
  esac
}

# Print an error message on stderr and abort the script.
createlex_die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Entry point; $1 is the corpus file.
createlex_main() {
  if [ "$#" -lt 1 ] || [ -z "$1" ]; then
    echo "usage: $0 <filename>" >&2
    return 1
  fi

  corpus=$1
  if [ ! -r "$corpus" ] || [ -d "$corpus" ]; then
    createlex_die "cannot read corpus file: $corpus"
  fi
  echo "creating lexicon from $corpus"

  # stuff needed for lexicon creation
  script_dir=$(createlex_script_dir "$0")
  tools_dir=$script_dir/..      # directory where freqstatmorf and wt2tt are found
  data_dir=$script_dir/data     # rules files etc. are found here

  [ -d "$data_dir" ] || createlex_die "data directory not found: $data_dir"
  for tool in wt2tt wt2ttt freqstatmorf; do
    [ -x "$tools_dir/$tool" ] ||
      createlex_die "required tool not found or not executable: $tools_dir/$tool"
  done

  # Private, freshly created temporary directory. If mktemp is not
  # available, fall back to a plain mkdir, which fails (instead of
  # reusing) when the name already exists.
  tmp=$(mktemp -d "${TMPDIR:-/tmp}/createlextmp.XXXXXX" 2>/dev/null) || tmp=
  if [ -z "$tmp" ]; then
    tmp=/tmp/createlextmp.$(hostname).$$
    mkdir "$tmp" || createlex_die "cannot create temporary directory: $tmp"
  fi
  # Remove the temporary directory on every exit path, including signals.
  trap 'rm -rf -- "${tmp:?}"' EXIT
  trap 'exit 1' HUP INT TERM

  tab=$(printf '\t')

  # Lower-case the ASCII letters A-Z (other letters are left untouched)
  # and collapse one tag spelling into "ro.gen".
  # NOTE(review): the dots in the sed pattern are unescaped and match any
  # character; presumably literal dots were intended - confirm.
  tr 'A-Z-' 'a-z-' <"$corpus" |
    sed 's/ro.mas.sin.ind\/def.gen/ro.gen/' >"$tmp/wtl"

  # add words the tagger is hard coded to look for in lexicon (to find
  # special tags)
  cat "$data_dir/critical_words.wtl" >>"$tmp/wtl"

  # remove foreign words (lines with a tab-delimited UO field, any case)
  grep -v -i "${tab}UO${tab}" "$tmp/wtl" >"$tmp/wtl.noUO"

  # NOTE(review): the "cut -f" calls applied to "uniq -c" output below
  # split on tabs, so the fields they select depend on how the local uniq
  # separates the count from the line - verify on the target platform.

  echo "creating word counts files..."
  mkdir -p words || createlex_die "cannot create directory: words"
  sort <"$tmp/wtl.noUO" >"$tmp/wtl.sorted"
  uniq -c <"$tmp/wtl.sorted" | sort -n -r >words/cwtl
  cut -f 1 <"$tmp/wtl.sorted" | uniq -c >words/cw

  cut -f 2 words/cw | wc -lc >words/info
  wc -l words/cwtl >>words/info

  echo "creating tag counts files..."
  mkdir -p tags || createlex_die "cannot create directory: tags"
  cut -f 2 <"$tmp/wtl.noUO" | sort | uniq -c >tags/ct
  rm -f "$tmp/wtl.noUO"     # intermediate files are removed as soon as possible
  # per the original author: tags/ctm is read by freqstatmorf
  cut -f 3 <words/cwtl | sort | uniq -c >tags/ctm
  # Keep a copy of fields 1-2 in $tmp/wt for the wt2ttt pass below.
  cut -f 1,2 "$tmp/wtl" | tee "$tmp/wt" | "$tools_dir/wt2tt" |
    grep -v -i 'UO' | sort | uniq -c >tags/ctt
  rm -f "$tmp/wtl"

  "$tools_dir/wt2ttt" <"$tmp/wt" | grep -v -i 'UO' | sort | uniq -c >tags/cttt
  rm -f "$tmp/wt"
  cut -f 2,3 words/cwtl >"$tmp/wt.unique"

  cp "$data_dir/taginfo" tags/taginfo
  cp "$data_dir/features" tags/features

  wc -l tags/ct >tags/info
  wc -l tags/ctt >>tags/info
  wc -l tags/cttt >>tags/info
  wc -l words/cwtl >>tags/info

  # Static word lists shipped with the tools. A missing file is reported
  # by cp but does not abort the run.
  for data_file in compound-end-stop.w compound-begin-ok.w \
    intransitivaverb bitransitivaverb feminina opt_space_words \
    spellNotOK spellOK; do
    cp "$data_dir/$data_file" "words/$data_file"
  done

  # foreign.w = system word list + english.w. The historical location
  # /usr/dict/words is tried first, then /usr/share/dict/words.
  system_dict=
  for candidate in /usr/dict/words /usr/share/dict/words; do
    if [ -r "$candidate" ]; then
      system_dict=$candidate
      break
    fi
  done
  if [ -n "$system_dict" ]; then
    cat "$system_dict" "$data_dir/english.w" >words/foreign.w
  else
    echo "warning: no system word list found, foreign.w contains english.w only" >&2
    cat "$data_dir/english.w" >words/foreign.w
  fi

  # create morf files, files with statistics of word endings (for guessing
  # unknown words)
  echo "creating morf files..."
  mkdir -p morfs || createlex_die "cannot create directory: morfs"
  "$tools_dir/freqstatmorf" <"$tmp/wt.unique" | sort >"$tmp/morf"
  rm -f "$tmp/wt.unique"

  uniq -c <"$tmp/morf" >morfs/cwt
  cut -f 1 <"$tmp/morf" | uniq -c >morfs/cw
  rm -f "$tmp/morf"
  cut -f 2 morfs/cw | wc -lc >morfs/info
  wc -l morfs/cwt >>morfs/info

  echo "fixing inflectlex..."
  cp "$data_dir/inflection.rules" words/inflection.rules
  cp "$data_dir/inflection.lex" words/inflection.lex

  # Show the summary files; cat is used so the script never waits on a pager.
  cat words/info
  cat tags/info
  cat morfs/info

  echo done
}

createlex_main "$@"