-
-
Save mnvx/66a92a4347bd077f637c976f62417844 to your computer and use it in GitHub Desktop.
Download, extract and set up everything necessary to parse Russian with MaltParser and Serge Sharoff's model (corpus.leeds.ac.uk/mocky/)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Downloads, extracts and sets up everything needed to parse Russian text
# with MaltParser and the models published at corpus.leeds.ac.uk/mocky/.
#
# Exit codes of the prerequisite checks below:
#   1 - neither wget nor curl is available
#   2 - perl is missing
#   3 - make or g++ is missing
#   4 - java is missing

# Directory containing this script; every download and the installation
# directory are created underneath it. The inner and outer expansions are
# quoted so a path containing spaces survives intact.
PREFIX=$(readlink -m -- "$(dirname -- "$0")")

# At least one downloader is required.
if ! command -v wget >/dev/null && ! command -v curl >/dev/null; then
  echo "You need either 'wget' or 'curl' programm to download necessary files" >&2
  exit 1
fi

if ! command -v perl >/dev/null; then
  echo "You'll need some perl. Consider installing it." >&2
  exit 2
fi

# cstlemma is built from source later on, hence the compiler toolchain.
if ! command -v make >/dev/null || ! command -v g++ >/dev/null; then
  echo "Sorry to bug you, but you need 'make' and 'g++' to compile cstlemma." >&2
  exit 3
fi

if ! command -v java >/dev/null; then
  echo "Well, you know that MaltParser is java application, don't you?" >&2
  exit 4
fi
#######################################
# Fetch a file unless it is already present on disk.
# Arguments:
#   $1 - path where the downloaded file is expected; if it already exists
#        as a regular file nothing is fetched
#   $2 - URL to fetch
#   $3 - directory to download into (both downloaders save the file under
#        the name taken from the URL, so $1 should be "$3/<URL basename>")
# Returns:
#   0 if the file already exists, otherwise the downloader's exit status.
#   Exits the script with status 1 when neither wget nor curl is available.
#######################################
download() {
  if [[ -f $1 ]]; then
    return 0
  fi

  if command -v wget >/dev/null; then
    wget -P "$3" "$2"
  elif command -v curl >/dev/null; then
    # Subshell: the caller's working directory is never touched, and the
    # function's status is curl's own rather than that of a trailing "cd -".
    # -f turns HTTP errors into a non-zero status, -L follows redirects
    # (which wget does by default).
    (cd "$3" && curl -fLO "$2")
  else
    echo "No suitable downloader, sorry =(" >&2
    exit 1
  fi
}
# Everything that has to be fetched, one "<subdirectory of $PREFIX> <URL>"
# pair per entry. Each file is stored under its URL basename, which is the
# name the download helper checks before fetching again.
remote_files=(
  "archives http://maltparser.org/dist/malt-1.5.tar.gz"
  "archives http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tree-tagger-linux-3.2.tar.gz"
  "models http://corpus.leeds.ac.uk/mocky/russian.par.gz"
  "models http://corpus.leeds.ac.uk/mocky/rus-test.mco"
  "scripts http://corpus.leeds.ac.uk/mocky/russian-malt.tgz"
  "scripts http://corpus.leeds.ac.uk/mocky/lemma-ru.tgz"
  "scripts http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz"
  "scripts http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/install-tagger.sh"
  "scripts http://corpus.leeds.ac.uk/tools/smallutils.pm"
)

for remote_file in "${remote_files[@]}"; do
  target_dir="$PREFIX/${remote_file%% *}"   # text before the first space
  url=${remote_file#* }                     # text after the first space
  mkdir -p -- "$target_dir"
  # Stop right away on a failed download: the steps that follow wipe and
  # rebuild the installation directory and would otherwise run against
  # missing archives.
  download "$target_dir/${url##*/}" "$url" "$target_dir" || {
    printf 'Failed to download %s\n' "$url" >&2
    exit 1
  }
done
# ---------------------------------------------------------------------------
# Installation phase: rebuild "$PREFIX/installation" from scratch out of the
# files fetched earlier, then run one sentence through the parser.
# ---------------------------------------------------------------------------

# Print a fatal installation error on stderr and stop the script.
abort_install() {
  printf '%s\n' "$*" >&2
  exit 1
}

# ${PREFIX:?} aborts when PREFIX is empty or unset, so this can never turn
# into "rm -rf /installation".
rm -rf -- "${PREFIX:?}/installation"
mkdir -p -- "$PREFIX/installation"
INSTALLATION=$(readlink -m -- "$PREFIX/installation")
[[ -n "$INSTALLATION" ]] || abort_install "Cannot resolve the installation directory"

# --- MaltParser: unpack and flatten malt-1.5/ into the installation root ---
tar -xf "$PREFIX/archives/malt-1.5.tar.gz" -C "$INSTALLATION" \
  || abort_install "Cannot unpack malt-1.5.tar.gz"
mv -- "$INSTALLATION"/malt-1.5/* "$INSTALLATION"
rmdir -- "$INSTALLATION/malt-1.5"

# --- TreeTagger ---
mkdir -p -- "$INSTALLATION/treetagger"
cp -- "$PREFIX/archives/tree-tagger-linux-3.2.tar.gz" "$INSTALLATION/treetagger"
cp -- "$PREFIX/scripts/tagger-scripts.tar.gz" "$INSTALLATION/treetagger"
pushd "$INSTALLATION/treetagger" || abort_install "Cannot enter $INSTALLATION/treetagger"
tar -xf tree-tagger-linux-3.2.tar.gz
# NOTE(review): install-tagger.sh is a downloaded third-party script; it is
# presumably meant to run in the directory holding the two archives copied
# above - confirm against the TreeTagger instructions.
sh "$PREFIX/scripts/install-tagger.sh"
popd

# --- Russian tagger model ---
# Decompress straight to the destination: the downloaded .gz is kept, no
# intermediate file is left in models/, and gzip's -k option is not needed.
gunzip -c -- "$PREFIX/models/russian.par.gz" \
  > "$INSTALLATION/treetagger/lib/russian-utf8.par"

# --- Lemmatiser scripts ---
tar -xf "$PREFIX/scripts/lemma-ru.tgz" -C "$INSTALLATION/treetagger/cmd"
cp -- "$PREFIX/scripts/smallutils.pm" "$INSTALLATION/treetagger/cmd/"
ln -s utf8-tokenize.perl "$INSTALLATION/treetagger/cmd/utf8-tokenize.pl"
# Make lemmatiser.pl look for its Perl modules next to itself instead of in
# the hard-coded /corpora/tools.
sed -i "s#use lib('/corpora/tools'#use File::Basename;\nuse lib(dirname(\$0)#g" \
  "$INSTALLATION/treetagger/cmd/lemmatiser.pl"

# --- cstlemma: fetch the upstream build script and run it ---
mkdir -- "$INSTALLATION/treetagger/cstlemma"
pushd "$INSTALLATION/treetagger/cstlemma" \
  || abort_install "Cannot enter $INSTALLATION/treetagger/cstlemma"
# Use the script's own download helper so the curl fallback applies here too
# (a bare wget call fails on curl-only systems).
download "$INSTALLATION/treetagger/cstlemma/makecstlemma.bash" \
  https://raw.githubusercontent.com/kuhumcst/cstlemma/master/doc/makecstlemma.bash \
  "$INSTALLATION/treetagger/cstlemma" \
  || abort_install "Cannot download makecstlemma.bash"
chmod +x makecstlemma.bash
./makecstlemma.bash
popd

# Build cstlemma and place the binary next to the other tagger commands.
pushd "$INSTALLATION/treetagger" || abort_install "Cannot enter $INSTALLATION/treetagger"
pushd cstlemma*/cstlemma/src || abort_install "Cannot find the cstlemma sources"
make
popd
cp cstlemma*/cstlemma/src/cstlemma cmd/
popd

# --- MaltParser model and wrapper script ---
cp -- "$PREFIX/models/rus-test.mco" "$INSTALLATION"
tar -xf "$PREFIX/scripts/russian-malt.tgz" -C "$INSTALLATION"
# Rewrite russian-malt.sh so every path is relative to the script itself.
# The expressions are single-quoted on purpose: $(dirname $0) and $MALT must
# land in the target script literally and be expanded when *it* runs.
# They are applied in this order to each line, as the original passes were.
sed -i \
  -e 's#/corpora/tools#$(dirname $0)#g' \
  -e 's#^MALT=.*#MALT=$(dirname $0)#g' \
  -e 's#russian.par#russian-utf8.par#g' \
  -e 's#make-malt.pl#$(dirname $0)/make-malt.pl#g' \
  -e 's#shake-malt.pl#$(dirname $0)/shake-malt.pl#g' \
  -e 's/^#\$/$/g' \
  -e 's#tmpmalttex#$MALT/tmpmalttex#g' \
  "$INSTALLATION/russian-malt.sh"

# Smoke test: parse one Russian sentence
# ("Well then, it is time to test our parser!").
printf '%s\n' 'Ну что ж, пришло время проверить наш парсер!' | "$INSTALLATION/russian-malt.sh"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment