#!/bin/bash
# Copyright © 2012, marmuta
#
# This file is part of Onboard.
#
# Onboard is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Onboard is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Important directories:
#
# ../training/raw_models
#                 - Input directory containing large raw models with only
#                   minimal processing (utf-8 encoded). Due to their large
#                   size, often in the GB range, raw models aren't
#                   included with Onboard's source package.
#
# ./models        - Output directory for final filtered system language models
#                   (utf-8 encoded).
#

# Locales that are built when none are named on the command line.
LANGUAGES="en_US en_GB en_AU \
           de_DE de_CH de_AT \
           fr_FR \
           it_IT \
           es_ES \
           pt_PT pt_BR \
           ru_RU"

# Helper tools; both paths are relative to the current working directory.
FILTER_CMD='Onboard/pypredict/tools/filter'
TRAIN_CMD='Onboard/pypredict/tools/train'

# Where raw models are read from, where filtered models are written to,
# and the file extension shared by both.
RAWMODELDIR='../training/raw_models'
MODELDIR='models'
MODELEXT='lm'

# Remaining defaults.
VERBOSE=0
ORDER=2

# Abort the script as soon as a command fails.
set -e

# Print the usage text to stderr.
# Only options the script actually parses are listed; the positional
# arguments replace the default language list.
help()
{
cat >&2 << END
Usage: ${0##*/} [-v] [-h] [languages...]
Script to create language models.
Options:
 -v  More verbose output

 -h  Show this help.

 languages  Locales to build, e.g. en_US de_DE (default: all known locales).
END
}


# Process command line options.
#   -v  sets VERBOSE=1
#   -h  prints the usage text and exits successfully
# Any other option prints the usage text and exits with status 1.
while getopts "vh" opt; do
	case "$opt" in
	v)
		VERBOSE=1
		;;
	h)
		# An explicit help request is not an error.
		help
		exit 0
		;;
	*)
		help
		exit 1
		;;
	esac
done
# Drop the parsed options so that only the language names remain in "$@".
shift $((OPTIND - 1))


# Languages named on the command line replace the default list.
if [ $# -gt 0 ]; then
    LANGUAGES="$*"
fi

# Make sure the output directory exists. Quoting keeps paths containing
# whitespace intact, and -p creates missing parents and is a no-op when the
# directory is already there.
mkdir -p -- "$MODELDIR"

# Filter the raw model of a single locale into its final system model.
#
# Globals read: FILTER_CMD, RAWMODELDIR, MODELDIR, MODELEXT
# Arguments:    $1 - locale name, e.g. en_US; its first two characters
#                    select the raw input model and the filter parameters
# Output:       progress message on stdout, errors on stderr
# Exits the script with status 1 for a locale without filter parameters.
build_model()
{
    local lang=$1
    local lang_id=${lang:0:2}
    local model_in="$RAWMODELDIR/$lang_id.$MODELEXT.sorted-pruned"
    local model_out="$MODELDIR/$lang.$MODELEXT"

    # Values handed to the filter tool's -p, -r, -n and -i options.
    # -r and -n are the same for every language except English's -r.
    local p_arg i_arg
    local r_arg="\'s$|^\S$"
    local n_arg="^\w{1}\ | \s\S{1,3}(?:\s|$)"
    # Extra options only some languages need (currently -x for English).
    local -a extra_args=()

    echo "Building '$model_out'..."

    case "$lang_id" in
        en)
            p_arg=499,499
            r_arg="\'s$|^[^IaA]$"
            i_arg=2.0
            extra_args=(-x Union,Kingdom,Nations,New,Barack,Obama,Bush,South,West,North,East,Southern,Western,Northern,Eastern,Mean,Standard,Coast,Time,Grand,Central,Station,Large,Cloud,Collider,Psychology,Geographic)
            ;;
        de) p_arg=399; i_arg=300.0 ;;
        fr) p_arg=169; i_arg=150.0 ;;
        es) p_arg=149; i_arg=50.0 ;;
        pt) p_arg=69;  i_arg=50.0 ;;
        it) p_arg=99;  i_arg=200.0 ;;
        ru) p_arg=249; i_arg=200.0 ;;
        *)
            echo "Unsupported language $lang, Aborting" >&2
            exit 1
            ;;
    esac

    # FILTER_CMD is left unquoted on purpose so that it may carry leading
    # arguments (e.g. an interpreter); every other word is quoted.
    $FILTER_CMD \
        -p "$p_arg" \
        -r "$r_arg" \
        -n "$n_arg" \
        -t \
        -l "$lang" \
        -N \
        "${extra_args[@]}" \
        -i "$i_arg" \
        --save-sorted "$model_in" "$model_out"
}

# filter language models
for lang in $LANGUAGES; do
    build_model "$lang"
    echo
done


