Last active
January 23, 2024 19:21
-
-
Save norandom/86a701a56b7de8c800a83eac293da813 to your computer and use it in GitHub Desktop.
A wrapper around the Log2Vec tools https://github.com/NetManAIOps/Log2Vec/tree/master - Log File Vectorization https://because-security.atlassian.net/wiki/spaces/LML/pages/11927594/Log2Vec+conda+Python+3.9
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
: ' | |
Copyright (c) 2024 Marius Ciepluch | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all | |
copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
' | |
# This script is a wrapper for the Log2Vec algorithm. | |
# It is based on the following paper and code: | |
# https://github.com/NetManAIOps/Log2Vec | |
# The license of the original code is included in the LICENSE file. | |
# Path: Log2Vec_script.sh in Log2Vec source root | |
# Abort the whole pipeline as soon as any command returns a non-zero status.
set -o errexit
# The stages below run inside the "log2vec" conda environment of a userland
# Anaconda3 install.
# Setup gist: https://gist.github.com/norandom/a1fd048d7d870a90aa72c9c45fd44e02
# Documentation: https://because-security.atlassian.net/wiki/spaces/LML/pages/11927594/Log2Vec+conda+Python+3.9
. ~/anaconda3/bin/activate
conda activate log2vec
# Log file to vectorize. Pass it as the first argument to the script; when no
# argument is given, the sample path below is used (edit it to point at your
# own log file).
INPUT_FILE="${1:-/home/marius/source/sample_logs/syslog.log}"
# Test for the file BEFORE canonicalizing it: with `set -e` active, a failing
# realpath inside the assignment would end the script before the message
# below could be printed.
if [[ ! -f "$INPUT_FILE" ]]; then
  echo 'File not found. Please verify the INPUT_FILE :'"$INPUT_FILE" >&2
  exit 1
fi
# Canonicalize to an absolute path; every later output name is derived from it.
INPUT_FILE=$(realpath -- "$INPUT_FILE")
echo 'File input found... proceeding with Log2Vec.'
echo "Stage 1: Preprocessing: Variable removal"
# Filter variables within the logs
python code/preprocessing.py -rawlog "$INPUT_FILE"
# Split off the suffix with shell parameter expansion
# (https://stackoverflow.com/a/965069/59300), but only within the file name:
# INPUT_FILE is an absolute path, and splitting the whole path at its first
# dot would cut inside a directory name such as /home/user.name/ and corrupt
# every derived output path.
STAGE_ONE_STRING="_without_variables"
input_dir="${INPUT_FILE%/*}"
input_name="${INPUT_FILE##*/}"
FILE_SUFFIX="${input_name#*.}"
INPUT_FILE_WITHOUT_SUFFIX="${input_dir}/${input_name%%.*}"
STAGE_ONE_OUTPUT=${INPUT_FILE_WITHOUT_SUFFIX}${STAGE_ONE_STRING}.$FILE_SUFFIX
# NOTE(review): assumes code/preprocessing.py writes
# <name>_without_variables.<suffix> next to the input file - confirm against
# that script. All later stages read this file, so fail here with a clear
# message if it is missing.
if [[ ! -f "$STAGE_ONE_OUTPUT" ]]; then
  echo "Stage 1 output not found: ${STAGE_ONE_OUTPUT}" >&2
  exit 1
fi
echo "Stage 1 finished preprocessing"
printf '%s\n' "===" "Stage 2: Antonyms and synonyms - extraction"
# Both word lists are named after the Stage 1 output stem and differ only in
# their extension.
SYNONYMS="${INPUT_FILE_WITHOUT_SUFFIX}${STAGE_ONE_STRING}.synonyms"
ANTONYMS="${INPUT_FILE_WITHOUT_SUFFIX}${STAGE_ONE_STRING}.antonyms"
# Extract synonyms and antonyms from the variable-free logs.
syn_ant_args=(
  -logs "$STAGE_ONE_OUTPUT"
  -ant_file "$ANTONYMS"
  -syn_file "$SYNONYMS"
)
python code/get_syn_ant.py "${syn_ant_args[@]}"
printf 'Synonyms: %s\n' "$SYNONYMS"
printf 'Antonyms: %s\n' "$ANTONYMS"
printf '%s\n' "Stage 2 finished synonym and antonym extraction"
echo "==="
echo "Stage 3: Relation Triple Extraction"
# relation triple extraction
STAGE_THREE_STRING=".triplets"
STAGE_THREE_OUTPUT=${INPUT_FILE_WITHOUT_SUFFIX}${STAGE_THREE_STRING}
# The second argument must be the EXPANDED path: Stage 4 hands
# "$STAGE_THREE_OUTPUT" to lrcwe via -triplet, so passing the bare word
# STAGE_THREE_OUTPUT here would make the two stages refer to different files.
# stdout of get_triplet.py is discarded on purpose (presumably verbose -
# confirm); errors on stderr remain visible.
python code/get_triplet.py "$STAGE_ONE_OUTPUT" "$STAGE_THREE_OUTPUT" >/dev/null
echo "Stage 3 finished relation triplet extraction"
echo "==="
echo "Stage 4: Semantic Word Embedding"
# getTempLogs.py turns the Stage 1 output into the file that lrcwe receives
# via -train below.
STAGE_FOUR_STRING=".training"
STAGE_FOUR_OUTPUT=${INPUT_FILE_WITHOUT_SUFFIX}${STAGE_FOUR_STRING}
python code/getTempLogs.py \
  -input "$STAGE_ONE_OUTPUT" \
  -output "$STAGE_FOUR_OUTPUT"
STAGE_FOUR_MODEL_SUFFIX=".model"
STAGE_FOUR_MODEL=${INPUT_FILE_WITHOUT_SUFFIX}${STAGE_FOUR_MODEL_SUFFIX}
STAGE_FOUR_VOCAB=${INPUT_FILE_WITHOUT_SUFFIX}".vocab"
echo ""
echo ""
echo "Model generation:"
echo ""
# Train the embedding from the training file, the Stage 2 synonym/antonym
# lists and the Stage 3 triplets. Each option must be a single word: a space
# between "-" and "alpha-rel" would split the flag into two unrelated
# arguments.
# NOTE(review): "-belta-rel" is kept exactly as written in the original
# command; confirm the spelling against lrcwe's option parser.
./code/LRWE/src/lrcwe \
  -train "$STAGE_FOUR_OUTPUT" \
  -synonym "$SYNONYMS" \
  -antonym "$ANTONYMS" \
  -output "$STAGE_FOUR_MODEL" \
  -save-vocab "$STAGE_FOUR_VOCAB" \
  -belta-rel 0.8 -alpha-rel 0.01 -alpha-ant 0.3 -size 32 -min-count 1 \
  -triplet "$STAGE_THREE_OUTPUT"
echo "Stage 4 finished semantic word embedding"
printf '%s\n' "===" "Stage 5: handle OOV Logs" ""
# All Stage 5 files share the input file's stem:
#   .pkl    - dataset built from the Stage 4 model
#   .vector - embedding trained for the OOV words. Results of w2v: the first
#             row is the number of rows and dimensions (can be omitted), each
#             subsequent row is word + word vector: word d1 d2... d32
#   .add    - extra vocabulary, one word per line (passed as --vocab). You
#             need to add some words here, otherwise there will be an error.
STAGE_FIVE_OUTPUT_ONE="${INPUT_FILE_WITHOUT_SUFFIX}.pkl"
STAGE_FIVE_OUTPUT_TWO="${INPUT_FILE_WITHOUT_SUFFIX}.vector"
ADDITIONAL_WORDS="${INPUT_FILE_WITHOUT_SUFFIX}.add"
# Read the original vector file
make_dataset_args=(
  --vectors "$STAGE_FOUR_MODEL"
  --w2v-format
  --output "$STAGE_FIVE_OUTPUT_ONE"
)
python code/mimick/make_dataset.py "${make_dataset_args[@]}"
# Train the new embedding according to OOV
mimick_model_args=(
  --dataset "$STAGE_FIVE_OUTPUT_ONE"
  --vocab "$ADDITIONAL_WORDS"
  --output "$STAGE_FIVE_OUTPUT_TWO"
)
python code/mimick/model.py "${mimick_model_args[@]}"
printf '%s\n' "Stage 5: finished handling OOV logs"
printf '%s\n' "===" "Stage 6: generate log vectors"
# Final step: combine the variable-free logs with the Stage 4 word model.
# NOTE(review): -log_vector_file receives the same .vector path that Stage 5
# passed to model.py as --output; confirm whether Log2Vec.py is meant to
# write to (and thereby replace) that file.
log2vec_args=(
  -logs "$STAGE_ONE_OUTPUT"
  -word_model "$STAGE_FOUR_MODEL"
  -log_vector_file "$STAGE_FIVE_OUTPUT_TWO"
  -dimension 32
)
python code/Log2Vec.py "${log2vec_args[@]}"
printf '%s\n' "Stage 6: finished generating log vectors" "Mission Success!"
Author
norandom
commented
Jan 19, 2024
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment