# Query the trained embedding: pairwise similarity, nearest neighbours, raw vector
similarity = embed_model.wv.similarity('sorted', 'ord')   # cosine similarity between two tokens
most_similar = embed_model.wv.most_similar('len(x)')      # nearest tokens; works even for out-of-vocabulary strings thanks to subwords
vector = embed_model.wv['for']                            # the embedding vector of a single token
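
These queries assume embed_model is the trained FastText model built in the snippets below. A minimal sketch of inspecting the results (the print calls are illustrative):

print(f"similarity(sorted, ord) = {similarity:.3f}")
for token, score in most_similar[:5]:
    print(f"{token!r}: {score:.3f}")   # most_similar returns (token, score) pairs
print(vector.shape)                    # (192,) given vector_size=192
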
from gensim.models import FastText

# Instantiate the model from the hyper-parameter dictionary defined below
embed_model = FastText(vector_size=meta_hyper['vector_size'],
                       window=meta_hyper['window'],
                       min_count=meta_hyper['min_count'],
                       alpha=meta_hyper['alpha'],
                       workers=meta_hyper['CPU'])

# Build the vocabulary from the tokenized source files before training
embed_model.build_vocab(tokenized_data)
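
The snippet stops after building the vocabulary; training would normally follow. A hedged sketch using gensim's standard training call (epochs comes from the meta_hyper dictionary in the next snippet; the save path is an assumption):

# Train on the tokenized corpus; gensim >= 4 requires total_examples and epochs
embed_model.train(tokenized_data,
                  total_examples=embed_model.corpus_count,
                  epochs=meta_hyper['epochs'])
embed_model.save('code_fasttext.model')   # hypothetical output path
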
meta_hyper = {
    "vector_size": 192,                # size of the embedding vectors
    "alpha": 0.025,                    # initial learning rate
    "window": 5,                       # context window size
    "min_count": 5,                    # ignore tokens rarer than this (matches the FastText argument above)
    "epochs": 300,                     # number of training epochs
    "CPU": 4,                          # worker threads; assumed value, set it to your core count
    "vocab_size": len(set(t for td in tokenized_data for t in td)),  # size of the vocabulary
    "data_description": "Add your description here",
    "data_size": len(tokenized_data),  # number of tokenized files
    "tokens_number": sum(len(td) for td in tokenized_data),          # total number of tokens
}
rootdir = "PATH TO THE DIRECTORY OF THE PROJECT YOU CLONED FROM GITHUB" | |
files_code = load_all_files(root_dir) | |
tokenized_data = [] | |
for d in files_code: | |
try: | |
tokenized_data.append(tokenize_python(d)) | |
except: | |
print('error') |
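
A quick sanity check after this loop catches an empty corpus early. A small sketch:

# Drop files whose tokenization failed or produced nothing
tokenized_data = [td for td in tokenized_data if td]
print(f"{len(tokenized_data)} of {len(files_code)} files tokenized")
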
from tokenize import tokenize, TokenError
from io import BytesIO

rename_globals = {}

def tokenize_python(code):
    # Python's own tokenizer works on a readline callable over bytes
    g = tokenize(BytesIO(code.encode('utf-8')).readline)
    try:
        # Keep token strings, dropping empties and bare newlines;
        # [1:] skips the initial ENCODING token
        tokens = [c[1] for c in g if c[1] != '' and c[1] != '\n'][1:]
        return tokens
    except (TokenError, IndentationError, SyntaxError):
        # Files with invalid syntax cannot be fully tokenized
        return []
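
A quick check of what the tokenizer produces on a toy function (output shown as a comment; note that indentation survives as its own token):

sample = "def add(a, b):\n    return a + b\n"
print(tokenize_python(sample))
# ['def', 'add', '(', 'a', ',', 'b', ')', ':', '    ', 'return', 'a', '+', 'b']
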
import os

def load_all_files(rootdir):
    # Walk the project tree and collect the source of every .py file
    files_code = []
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            if file.endswith('.py'):
                with open(os.path.join(subdir, file), encoding='utf-8') as pf:
                    try:
                        code = pf.read()
                        files_code.append(code)
                    except UnicodeDecodeError:
                        # Skip files that are not valid UTF-8
                        pass
    return files_code
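
A minimal usage sketch (the path is hypothetical):

files_code = load_all_files('/tmp/cloned-project')   # hypothetical path
print(f"loaded {len(files_code)} Python files")
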
import sumy
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
# LSA algorithm
from sumy.summarizers.lsa import LsaSummarizer

# text: text to summarize
# no_sentences: number of sentences in your summary
# lang: language of the text, e.g. "english"
def lsa_summary(text, no_sentences, lang):
    # Standard sumy flow: parse plain text, summarize, join the sentences
    parser = PlaintextParser.from_string(text, Tokenizer(lang))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, no_sentences)
    return ' '.join(str(sentence) for sentence in summary)
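
A hedged usage example (the text is illustrative; sumy's English tokenizer needs NLTK's punkt data):

article = ("Lviv is a city in western Ukraine. It is one of the main cultural "
           "centres of the country. The historic centre is on the UNESCO World "
           "Heritage List. Tourists visit it all year round.")
print(lsa_summary(article, 2, "english"))   # the two most informative sentences
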
# do a lot of imports
from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np
from os import listdir
import os.path
import json
from sklearn.preprocessing import OneHotEncoder
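
The model definition itself is not part of this snippet; a minimal sketch of a binary LSTM classifier consistent with these imports (layer sizes and input shape are assumptions):

# Hypothetical shapes: sequences of 64 steps with 192 features per step
model = Sequential([
    LSTM(128, input_shape=(64, 192)),   # assumed sequence length / feature size
    Dense(1, activation='sigmoid'),     # binary output: benign vs malicious
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
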
inst_groups = {
    # Conditional Data Transfer
    'cdt': ['cmove', 'cmovz', 'cmovne', 'cmovnz', 'cmova', 'cmovnbe', 'cmovae', 'cmovnb', 'cmovb',
            'cmovnae', 'cmovbe', 'cmovna', 'cmovg',
            'cmovnle', 'cmovge', 'cmovnl', 'cmovl', 'cmovnge', 'cmovle', 'cmovng',
            'cmovc', 'cmovnc', 'cmovo', 'cmovno', 'cmovs', 'cmovns', 'cmovp', 'cmovpe',
            'cmovnp', 'cmovpo'],
    # Unconditional Data Transfer
    'udt': ['mov', 'xchg', 'bswap', 'movsx', 'movzx', 'movlps', 'movdqa', 'lock xchg'],
    # Stack Data Transfer
    'sdt': ['push', 'pusha', 'pushad', 'pop', 'popa', 'popad'],  # assumed members of the push/pop family
}
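
A typical way to use such a table is to invert it once for constant-time lookups from mnemonic to group (the helper name is hypothetical):

mnemonic_to_group = {m: g for g, members in inst_groups.items() for m in members}

def group_of(mnemonic):
    # Unknown mnemonics fall into a catch-all bucket
    return mnemonic_to_group.get(mnemonic, 'other')

print(group_of('cmovz'))   # cdt
print(group_of('push'))    # sdt
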
# Change the parameters to whatever suits you
batch_size = 512
epochs = 100

# Stack both classes into arrays; 0 = benign, 1 = malicious
X = np.array(benign_images + malicious_images)
y = np.array([0] * len(benign_images) + [1] * len(malicious_images))

# Shuffle first: validation_split slices the END of the data before any
# shuffling, so without this the validation set would be all-malicious
perm = np.random.permutation(len(X))
X, y = X[perm], y[perm]

model.fit(X, y,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.25,
          shuffle=True)
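
A short follow-up sketch for checking the result (X_test and y_test are hypothetical held-out arrays prepared the same way as the training data):

loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"test accuracy: {acc:.3f}")
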