Last active
January 8, 2019 11:06
-
-
Save versusvoid/da4e71467a4c0f9e1a1c to your computer and use it in GitHub Desktop.
Download, extract and set up everything necessary to parse Russian with MaltParser and Serge Sharoff's model (corpus.leeds.ac.uk/mocky/)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash

# Absolute path of the directory holding this script. Every download and the
# installation tree are placed underneath it. Quoted so that a script path
# containing whitespace still resolves to a single directory.
PREFIX=$(readlink -m -- "$(dirname -- "$0")")

# Pre-flight checks: stop early, with a distinct exit status per missing
# tool (1 = downloader, 2 = perl, 3 = make/g++, 4 = java). Diagnostics go to
# stderr so they are not mixed into regular output.
if ! command -v wget >/dev/null 2>&1 && ! command -v curl >/dev/null 2>&1; then
  echo "You need either 'wget' or 'curl' programm to download necessary files" >&2
  exit 1
fi

if ! command -v perl >/dev/null 2>&1; then
  echo "You'll need some perl. Consider installing it." >&2
  exit 2
fi

if ! command -v make >/dev/null 2>&1 || ! command -v g++ >/dev/null 2>&1; then
  echo "Sorry to bug you, but you need 'make' and 'g++' to compile cstlemma." >&2
  exit 3
fi

if ! command -v java >/dev/null 2>&1; then
  echo "Well, you know that MaltParser is java application, don't you?" >&2
  exit 4
fi
#######################################
# Fetch a URL into a directory unless the expected local file already exists.
# Uses wget when available, otherwise curl.
# Arguments:
#   $1 - path of the file the download is expected to produce; when it
#        already exists as a regular file nothing is fetched
#   $2 - URL to fetch
#   $3 - directory the downloaded file is stored in
# Returns:
#   0 when the file was already present or the transfer succeeded,
#   otherwise the downloader's non-zero status.
#   Exits the script with status 1 when neither wget nor curl is installed.
#######################################
download() {
  local target=$1
  local url=$2
  local dest_dir=$3
  local rc=0

  if [[ -f "$target" ]]; then
    return 0
  fi

  if command -v wget >/dev/null 2>&1; then
    wget "$url" -P "$dest_dir" || rc=$?
  elif command -v curl >/dev/null 2>&1; then
    # The subshell keeps the caller's working directory untouched.
    # -f turns HTTP errors into a failure instead of saving the error page,
    # -L follows redirects (wget does both by default).
    (cd "$dest_dir" && curl -fLO "$url") || rc=$?
  else
    echo "No suitable downloader, sorry =(" >&2
    exit 1
  fi

  if ((rc != 0)); then
    # The target did not exist before this call, so anything there now is a
    # partial download; remove it so that a re-run retries the transfer.
    rm -f -- "$target"
  fi
  return "$rc"
}
# Everything that has to be fetched, as "<subdirectory of PREFIX> <URL>".
# The local file name is always the last path component of the URL.
downloads=(
  "archives http://maltparser.org/dist/malt-1.5.tar.gz"
  "archives http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-linux-3.2.tar.gz"
  "archives http://cst.dk/download/cstlemma/cstlemma.zip"
  "models http://corpus.leeds.ac.uk/mocky/russian.par.gz"
  "models http://corpus.leeds.ac.uk/mocky/rus-test.mco"
  "scripts http://corpus.leeds.ac.uk/mocky/russian-malt.tgz"
  "scripts http://corpus.leeds.ac.uk/mocky/lemma-ru.tgz"
  "scripts http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz"
  "scripts http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/install-tagger.sh"
  "scripts http://corpus.leeds.ac.uk/tools/smallutils.pm"
)

for entry in "${downloads[@]}"; do
  subdir=${entry%% *}
  url=${entry#* }
  dest_dir="$PREFIX/$subdir"

  # Abort (status 5) rather than continue to the installation step with a
  # missing directory or file.
  if ! mkdir -p -- "$dest_dir"; then
    echo "Cannot create directory $dest_dir" >&2
    exit 5
  fi
  if ! download "$dest_dir/${url##*/}" "$url" "$dest_dir"; then
    echo "Failed to download $url" >&2
    exit 5
  fi
done
# Rebuild the installation tree from scratch. ${PREFIX:?} aborts the script
# instead of letting the path collapse to "/installation" should PREFIX ever
# be empty or unset.
rm -rf -- "${PREFIX:?}/installation"
mkdir -p -- "$PREFIX/installation" || exit 6
INSTALLATION=$(readlink -m -- "$PREFIX/installation")

# MaltParser: unpack, then move the contents of malt-1.5/ up one level.
tar -xf "$PREFIX/archives/malt-1.5.tar.gz" -C "$INSTALLATION"
mv -- "$INSTALLATION"/malt-1.5/* "$INSTALLATION"
rmdir -- "$INSTALLATION/malt-1.5"

# TreeTagger: the installer is run from inside the directory that holds the
# two archives (presumably it looks for them in the current directory).
mkdir -p -- "$INSTALLATION/treetagger"
cp -- "$PREFIX/archives/tree-tagger-linux-3.2.tar.gz" "$INSTALLATION/treetagger"
cp -- "$PREFIX/scripts/tagger-scripts.tar.gz" "$INSTALLATION/treetagger"
# Never run the installer from an unexpected directory.
pushd "$INSTALLATION/treetagger" || exit 6
sh "$PREFIX/scripts/install-tagger.sh"
popd || exit 6

# Russian tagger model: decompress straight to its final name. Streaming with
# -c leaves the .gz in place and cannot trip over a stale russian.par left by
# an earlier, interrupted run.
gunzip -c "$PREFIX/models/russian.par.gz" \
  > "$INSTALLATION/treetagger/lib/russian-utf8.par"

# Lemmatiser scripts and their helper module.
tar -xf "$PREFIX/scripts/lemma-ru.tgz" -C "$INSTALLATION/treetagger/cmd"
cp -- "$PREFIX/scripts/smallutils.pm" "$INSTALLATION/treetagger/cmd/"
# Make the tokenizer reachable under the .pl name as well.
ln -s utf8-tokenize.perl "$INSTALLATION/treetagger/cmd/utf8-tokenize.pl"
# Have lemmatiser.pl add its own directory to the Perl library path instead
# of the hard-coded /corpora/tools (\$0 is escaped so it reaches the file
# literally).
sed -i "s#use lib('/corpora/tools'#use File::Basename;\nuse lib(dirname(\$0)#g" "$INSTALLATION/treetagger/cmd/lemmatiser.pl"

# cstlemma: the downloaded zip is extracted, the zip(s) found inside are
# extracted in turn, then the sources are built with make.
unzip "$PREFIX/archives/cstlemma.zip" -d "$INSTALLATION/treetagger"
pushd "$INSTALLATION/treetagger" || exit 6
# The globs below are intentionally unquoted: the versioned names are unknown.
unzip ./cstlemma*.zip
pushd cstlemma*/cstlemma/src || exit 6
make
popd || exit 6
cp cstlemma*/cstlemma/src/cstlemma cmd/
popd || exit 6

# Parser model and the driver script.
cp -- "$PREFIX/models/rus-test.mco" "$INSTALLATION"
tar -xf "$PREFIX/scripts/russian-malt.tgz" -C "$INSTALLATION"

# Patch russian-malt.sh so that it resolves everything relative to its own
# location. The single quotes keep $(dirname $0) and $MALT literal here; they
# are expanded only when russian-malt.sh itself runs.
malt_script="$INSTALLATION/russian-malt.sh"
sed -i 's#/corpora/tools#$(dirname $0)#g' "$malt_script"
sed -i 's#^MALT=.*#MALT=$(dirname $0)#g' "$malt_script"
sed -i "s#russian.par#russian-utf8.par#g" "$malt_script"
sed -i 's#make-malt.pl#$(dirname $0)/make-malt.pl#g' "$malt_script"
sed -i 's#shake-malt.pl#$(dirname $0)/shake-malt.pl#g' "$malt_script"
# Strip the leading '#' from lines that start with '#$'.
sed -i 's/^#\$/$/g' "$malt_script"
sed -i 's#tmpmalttex#$MALT/tmpmalttex#g' "$malt_script"

# Smoke test: pipe one Russian sentence through the installed script.
echo 'Ну что ж, пришло время проверить наш парсер!' | "$malt_script"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment