diff --git a/capectracer/capec_specs_dict.py b/capectracer/capec_specs_dict.py
index 12aedcd32cc9e237ee3d814b48a13ee82007b41a..c3c414b49e733eba9e0f2b14fbb0cb2b71960026 100644
--- a/capectracer/capec_specs_dict.py
+++ b/capectracer/capec_specs_dict.py
@@ -1,72 +1,25 @@
-import xmltodict
-import json
 import re
+import xmltodict
 
 from tokens_dict import TokensDict
-from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS, Phraser
-
+# Define the CapecSpecsDict class, inheriting from TokensDict
 class CapecSpecsDict(TokensDict):
-    def __init__(self, capec_file, training_corpus_file):
+    def __init__(self, capec_file, spec_file):
         super().__init__()
-
-        self.__capec_names = []
-        self.__capec_descriptions = []
-        self.__capec_execution_flows = []
-        self.__capec_mitigations = []
-        self.__capec_tokens = []
-        self.__capec_tokens_lemm = []
-        self.__system_spec_tokens = []
-        self.__system_spec_tokens_lemm = []
-        self.__training_tokens = []
-        # self.__bigram_model = None
-        # self.__trigram_model = None
-
-        self.parse_capecs(capec_file)
-        self.parse_training_corpus(training_corpus_file)
-
-    def get_capec_names(self):
-        return self.__capec_names
-
-    def get_capec_descs(self):
-        return self.__capec_descriptions
-
-    def get_capec_execution_flows(self):
-        return self.__capec_execution_flows
-
-    def get_capec_mitigations(self):
-        return self.__capec_mitigations
-
-    def get_capec_tokens(self):
-        return self.__capec_tokens
-
-    def get_capec_tokens_lemm(self):
-        return self.__capec_tokens_lemm
-
-    def get_system_spec_tokens(self):
-        return self.__system_spec_tokens
-
-    def get_system_spec_tokens_lemm(self):
-        return self.__system_spec_tokens_lemm
-
-    def get_training_tokens(self):
-        return self.__training_tokens
-
-    def get_training_word_doc_count(self):
-        word_document_count = {} # Dictionary to store word-document counts
-
-        # Iterate over each document in the corpus
-        for document in self.__training_tokens:
-            unique_words_in_doc = set(document)
-            # Update the word-document count for each unique word in the document
-            for word in unique_words_in_doc:
-                # Increment the count for the word if it exists, otherwise initialize it to 1
-                word_document_count[word] = word_document_count.get(word, 0) + 1
+        self.capec_names = []
+        self.capec_descriptions = []
+        self.capec_execution_flows = []
+        self.capec_mitigations = []
+        self.capec_tokens = []
+        self.capec_sentences = []
 
-        sorted_counts = sorted(word_document_count.items(), key=lambda x: (-x[1], x[0]))
+        self.system_spec_tokens = []
+        self.system_spec_sentences = []
 
-        return sorted_counts
+        self.parse_capecs(capec_file)
+        self.parse_system_specs(spec_file)
 
     def parse_capecs(self, capec_file):
         capec_descriptions = []
@@ -83,33 +36,36 @@ class CapecSpecsDict(TokensDict):
         attack_patterns = capec_dict['Attack_Pattern_Catalog']['Attack_Patterns']['Attack_Pattern']
 
         for attack_pattern in attack_patterns:
-            if (attack_pattern['@Abstraction'] == 'Standard' or \
-                attack_pattern['@Abstraction'] == 'Detailed') and \
+            if attack_pattern['@Abstraction'] == 'Detailed' and \
                attack_pattern['@Status'] != 'Obsolete' and \
               attack_pattern['@Status'] !='Deprecated' and \
-               "Execution_Flow" in attack_pattern:
+               "Description" in attack_pattern and \
+               attack_pattern["Description"] != None:
                 capec_names.append(attack_pattern['@Name'])
 
-                description = ""
                 description = attack_pattern["Description"]
 
                 if "Extended_Description" in attack_pattern:
                     description = description + " " + attack_pattern["Extended_Description"]
-
-                capec_descriptions.append(description)
+
+                cleaned_desc = super().clean_text(description)
+                capec_descriptions.append(cleaned_desc)
 
                 attack_steps_list = []
-                attack_steps = attack_pattern["Execution_Flow"]["Attack_Step"]
+
+                if "Execution_Flow" in attack_pattern:
+                    attack_steps = attack_pattern["Execution_Flow"]["Attack_Step"]
 
-                if isinstance(attack_steps, list):
-                    for idx, attack_step in enumerate(attack_steps):
-                        cleaned_attack_step = "Step " + str(idx + 1) + ". " + \
-                            super().clean_text(attack_step["Description"])
+                    if isinstance(attack_steps, list):
+                        for idx, attack_step in enumerate(attack_steps):
+                            attack_stp = "Step " + str(idx + 1) + ". " + attack_step["Description"] + "."
+                            cleaned_attack_step = super().clean_text(attack_stp)
+                            attack_steps_list.append(cleaned_attack_step)
+                    else:
+                        attack_stp = "Step 1. " + attack_steps["Description"] + "."
+                        cleaned_attack_step = super().clean_text(attack_stp)
                         attack_steps_list.append(cleaned_attack_step)
-                else:
-                    cleaned_attack_step = "Step 1. " + super().clean_text(attack_steps["Description"])
-                    attack_steps_list.append(cleaned_attack_step)
 
                 capec_execution_flows.append(attack_steps_list)
 
@@ -128,94 +84,34 @@ class CapecSpecsDict(TokensDict):
 
             capec_mitigations.append(mitigations_list)
 
-        capec_descriptions = super().clean_texts(capec_descriptions)
-
-        self.__capec_names = capec_names
-        self.__capec_descriptions = capec_descriptions
-        self.__capec_execution_flows = capec_execution_flows
-        self.__capec_mitigations = capec_mitigations
-
-        capec_tokens = super().tokenize_text(capec_descriptions)
-        capec_tokens = super().lowercase_tokens(capec_tokens)
-        capec_tokens = super().remove_numbers(capec_tokens)
-
-        capec_tokens_lemm = super().lemmatize_tokens(capec_tokens)
-
-        self.__capec_tokens = capec_tokens
-        self.__capec_tokens_lemm = capec_tokens_lemm
-
-        # bigram = Phrases(training_tokens, min_count=7, threshold=18,
-        #                  connector_words=ENGLISH_CONNECTOR_WORDS)
-        # self.__bigram_model = Phraser(bigram)
-
-        # trigram = Phrases(bigram[training_tokens], min_count=16, threshold=26,
-        #                   connector_words=ENGLISH_CONNECTOR_WORDS)
-        # self.__trigram_model = Phraser(trigram)
-
-        # # Apply the bigram model to each document in the training tokens corpus
-        # training_tokens = [self.__bigram_model[doc] for doc in training_tokens]
-        # # Apply the trigram model to the resulting bigram training tokens corpus
-        # training_tokens = [self.__trigram_model[doc] for doc in training_tokens]
-
-    def parse_training_corpus(self, training_corpus_file):
-        unduped_training_tokens = []
+        self.capec_names = capec_names
+        self.capec_descriptions = capec_descriptions
+        self.capec_execution_flows = capec_execution_flows
+        self.capec_mitigations = capec_mitigations
 
-        # Open the JSON file
-        with open(training_corpus_file, 'r') as file:
-            # Load the JSON data
-            data = json.load(file)
+        capec_data = []
 
-            for source in ["MS-Bulletin", "Metasploit", "NVD"]:
-                for _, value in data[source].items():
-                    corpora_tokens = []
+        for capec_index, capec_description in enumerate(self.capec_descriptions):
+            capec_desc_ef = capec_description
 
-                    for word_label in value:
-                        word = word_label[0]
+            for attack_step in self.capec_execution_flows[capec_index]:
+                attack_stp = re.sub(r'^Step [0-9]+\. ','', attack_step)
+                attack_stp = re.sub(r'^\[.*\] ','', attack_stp)
+                capec_desc_ef = capec_desc_ef + " " + attack_stp
 
-                        if word.isalnum():
-                            corpora_tokens.append(word)
+            capec_data.append(capec_desc_ef)
 
-                    unduped_training_tokens.append(corpora_tokens)
+        capec_sentences, capec_tokens = super().preprocess(capec_data)
 
-        unduped_training_tokens = super().lowercase_tokens(unduped_training_tokens)
-        unduped_training_tokens = super().remove_numbers(unduped_training_tokens)
-
-        unduped_training_tokens += self.__capec_tokens
-
-        training_tokens_dict = {}
-
-        # Iterate through each array in the outer array
-        for index, token_set in enumerate(unduped_training_tokens):
-            # Sort the elements of the array to ignore the order
-            sorted_token_set = sorted(token_set)
-
-            # Convert the sorted array to a tuple to make it hashable
-            token_set_tuple = tuple(sorted_token_set)
-
-            # Compute the hash value of the sorted array
-            token_set_hash = hash(token_set_tuple)
-
-            # Check if the hash value already exists in the dictionary
-            if token_set_hash not in training_tokens_dict:
-                # If the hash value doesn't exist, add the sorted array to the dictionary
-                training_tokens_dict[token_set_hash] = index
-
-        training_tokens = [unduped_training_tokens[index] for index in list(training_tokens_dict.values())]
-
-        self.__training_tokens = training_tokens
+        self.capec_tokens = capec_tokens
+        self.capec_sentences = capec_sentences
 
     def parse_system_specs(self, spec_file):
         with open(spec_file, 'r') as file:
-            # Read the contents of the file
             file_contents = file.read()
 
-        system_spec_texts = super().clean_texts([file_contents])
-
-        system_spec_tokens = super().tokenize_text(system_spec_texts)
-        system_spec_tokens = super().lowercase_tokens(system_spec_tokens)
-        system_spec_tokens = super().remove_numbers(system_spec_tokens)
-
-        system_spec_tokens_lemm = super().lemmatize_tokens(system_spec_tokens)
+        system_spec_texts = [super().clean_text(file_contents)]
+        system_spec_sentences, system_spec_tokens = super().preprocess(system_spec_texts)
 
-        self.__system_spec_tokens = system_spec_tokens[0]
-        self.__system_spec_tokens_lemm = system_spec_tokens_lemm[0]
\ No newline at end of file
+        self.system_spec_tokens = system_spec_tokens[0]
+        self.system_spec_sentences = system_spec_sentences[0]
diff --git a/capectracer/capec_tracer.py b/capectracer/capec_tracer.py
index e3f862936a3d64e86ab6d2d5cd7b85000b2821a9..2670d5e84aebdefaf15b387c2b72b7935764db79 100644
--- a/capectracer/capec_tracer.py
+++ b/capectracer/capec_tracer.py
@@ -1,14 +1,14 @@
 import requests
 import os
+import torch
+import decimal
 
 from capec_specs_dict import CapecSpecsDict
-from gensim.corpora import Dictionary
-from gensim.models import LdaModel
-from gensim.matutils import hellinger
 from gensim import similarities
+from gensim.corpora.dictionary import Dictionary
 from gensim.models import TfidfModel
-from sklearn.utils import shuffle
+from sentence_transformers import SentenceTransformer, util
 
 def get_capec_file(abs_path):
     # Send a GET request to the URL
@@ -22,36 +22,48 @@ def get_capec_file(abs_path):
 
     return response
 
+def round_dec(x, place, round_up):
+    context = decimal.getcontext()
+    original_rounding = context.rounding
+
+    if round_up == True:
+        context.rounding = decimal.ROUND_CEILING
+    else:
+        context.rounding = decimal.ROUND_FLOOR
+
+    rounded = round(decimal.Decimal(str(x)), place)
+    context.rounding = original_rounding
+
+    return float(rounded)
+
 def trace_capecs(abs_path):
     try:
-        tokens_dicti = CapecSpecsDict(abs_path + "capec_latest.xml", abs_path + 'full_corpus.json')
+        dicti = CapecSpecsDict(abs_path + "capec_latest.xml", abs_path + "system_specs.txt")
 
-        capec_names = tokens_dicti.get_capec_names()
-        capec_descs = tokens_dicti.get_capec_descs()
-        capec_attack_steps = tokens_dicti.get_capec_execution_flows()
-        capec_mitigations = tokens_dicti.get_capec_mitigations()
-        capec_tokens = tokens_dicti.get_capec_tokens()
-        capec_tokens_lemm = tokens_dicti.get_capec_tokens_lemm()
-
-        tokens_dicti.parse_system_specs(abs_path + "system_specs.txt")
-        spec_tokens = tokens_dicti.get_system_spec_tokens()
-        spec_tokens_lemm = tokens_dicti.get_system_spec_tokens_lemm()
+        capec_names = dicti.capec_names
+        capec_descs = dicti.capec_descriptions
+        capec_attack_steps = dicti.capec_execution_flows
+        capec_mitigations = dicti.capec_mitigations
+        all_capec_sentences = dicti.capec_sentences
+        capec_tokens = dicti.capec_tokens
+        spec_sentences = dicti.system_spec_sentences
+        spec_tokens = dicti.system_spec_tokens
 
         # Create a dictionary from the corpus documents
-        capec_lemm_dictionary = Dictionary(capec_tokens_lemm)
+        capec_dictionary = Dictionary(capec_tokens)
         # Create a bag-of-words corpus from the corpus documents
-        capec_lemm_corpus = [capec_lemm_dictionary.doc2bow(doc) for doc in capec_tokens_lemm]
+        capec_corpus = [capec_dictionary.doc2bow(doc) for doc in capec_tokens]
 
         # Create a TF-IDF model
-        tfidf_model = TfidfModel(capec_lemm_corpus, id2word=capec_lemm_dictionary)
+        tfidf_model = TfidfModel(capec_corpus, id2word=capec_dictionary)
 
         # Convert the checked document to TF-IDF representation
-        spec_lemm_bow = capec_lemm_dictionary.doc2bow(spec_tokens_lemm)
-        spec_tfidf = tfidf_model[spec_lemm_bow]
+        spec_bow = capec_dictionary.doc2bow(spec_tokens)
+        spec_tfidf = tfidf_model[spec_bow]
 
         # Create a similarity index
-        index = similarities.MatrixSimilarity(tfidf_model[capec_lemm_corpus], num_features=len(capec_lemm_dictionary))
+        index = similarities.MatrixSimilarity(tfidf_model[capec_corpus], num_features=len(capec_dictionary))
 
         # Calculate similarity scores
         similarity_scores = index[spec_tfidf]
@@ -67,57 +79,51 @@ def trace_capecs(abs_path):
         #     print(score)
        #     print(capec_descs[idx])
 
-        if sorted_doc_similarity_pairs[1][1] > 0:
-            training_tokens = tokens_dicti.get_training_tokens()
-            training_tokens = shuffle(training_tokens)
-
-            training_dictionary = Dictionary(training_tokens)
-            training_dictionary.filter_extremes(no_below=2, no_above=0.3)
-
-            training_corpus = [training_dictionary.doc2bow(text) for text in training_tokens]
-
-            lda = LdaModel(corpus=training_corpus,
-                           num_topics=18,
-                           id2word=training_dictionary,
-                           passes=12,
-                           alpha="auto",
-                           eta="auto",
-                           iterations=6,
-                           update_every=1,
-                           decay=0.745786579640169,
-                           offset=3.20092862358715
-                           )
-
-            system_specs_bow = training_dictionary.doc2bow(spec_tokens)
-            system_spec_topic_dist = lda[system_specs_bow]
-
-            difference_scores = []
+        if sorted_doc_similarity_pairs[0][1] > .16:
+            model = SentenceTransformer('basel/ATTACK-BERT')
 
-            for index, capec_token_set in enumerate(capec_tokens):
-                capec_desc_bow = training_dictionary.doc2bow(capec_token_set)
-                capec_topic_dist = lda[capec_desc_bow]
+            spec_embeddings = model.encode(spec_sentences, convert_to_tensor=True)
 
-                score = [index, hellinger(capec_topic_dist, system_spec_topic_dist)]
+            difference_scores = []
+
+            for index, capec_sentences in enumerate(all_capec_sentences):
+                capec_embeddings = model.encode(capec_sentences, convert_to_tensor=True)
+                cos_scores = util.cos_sim(spec_embeddings, capec_embeddings)
+                mean_cos_score = torch.mean(cos_scores).item()
+                score = [index, mean_cos_score]
                 difference_scores.append(score)
 
             # Sort the arrays based on the value of the second index in each array
-            difference_scores = sorted(difference_scores, key=lambda x: x[1], reverse=False)
+            difference_scores = sorted(difference_scores, key=lambda x: x[1], reverse=True)
+            max_score = round_dec(difference_scores[0][1], 2, True)
+            min_score = min(score[1] for score in difference_scores if score[1] > 0)
+            min_score = round_dec(min_score, 2, False)
 
             with open(abs_path + 'traced_capecs.txt', 'w') as output:
                 for score in difference_scores:
-                    output.write(f'Confidence score: {100 - int(100 * score[1])}%\n')
-                    output.write(f'Name: {capec_names[score[0]]}\n')
-                    output.write(f'Description:\n')
-                    output.write(f'{capec_descs[score[0]]}\n')
-                    output.write(f'Attack Steps:\n')
-                    for attack_step in capec_attack_steps[score[0]]:
-                        output.write(f'{attack_step}\n')
-                    if capec_mitigations[score[0]]:
-                        output.write(f"Mitigations:\n")
-                        for mitigation in capec_mitigations[score[0]]:
-                            output.write(f'{mitigation}\n')
-                    output.write('\n')
+                    if score[1] > 0:
+                        normalized_score = (score[1] - min_score) / (max_score - min_score)
+                        confidence_score = int(100 * normalized_score)
+
+                        if confidence_score > 0:
+                            output.write(f'Confidence score: {confidence_score}%\n')
+                            output.write(f'Name: {capec_names[score[0]]}\n')
+                            output.write(f'Description:\n')
+                            output.write(f'{capec_descs[score[0]]}\n')
+
+                            if capec_attack_steps[score[0]]:
+                                output.write(f'Attack Steps:\n')
+
+                                for attack_step in capec_attack_steps[score[0]]:
+                                    output.write(f'{attack_step}\n')
+
+                            if capec_mitigations[score[0]]:
+                                output.write(f"Mitigations:\n")
+
+                                for mitigation in capec_mitigations[score[0]]:
+                                    output.write(f'{mitigation}\n')
+                            output.write('\n')
         else:
             with open(abs_path + 'traced_capecs.txt', 'w') as output:
                 output.write("No attack patterns were able to be identified with the provided system specifications. " +
@@ -138,3 +144,5 @@ if __name__ == "__main__":
     else:
         with open(abs_path + 'traced_capecs.txt', 'w') as output:
             output.write(f"Failed to download the list of CAPECs from MITRE. Status code: {response.status_code}.")
+
+    print("Results (or errors if any were encountered) have been published to traced_capecs.txt.")
diff --git a/capectracer/full_corpus.json b/capectracer/full_corpus.json
deleted file mode 100755
index b7c038819c6f10f173b44b6a01e3da0c5c8b1086..0000000000000000000000000000000000000000
Binary files a/capectracer/full_corpus.json and /dev/null differ
diff --git a/capectracer/requirements_capec_tracer.txt b/capectracer/requirements_capec_tracer.txt
index 4e44f6d8edd320faa0753aec8e287bb940ec58c8..5ddadcd67395243087b307c71fa6dfbfe8148ebd 100644
--- a/capectracer/requirements_capec_tracer.txt
+++ b/capectracer/requirements_capec_tracer.txt
@@ -1,7 +1,7 @@
-scikit-learn==1.2
 gensim==4.1.2
 spacy==3.7.2
 xmltodict==0.13.0
 nltk==3.7
-scipy==1.10.1
 requests==2.25.1
+torch==2.3.0
+sentence_transformers==2.7.0
\ No newline at end of file
diff --git a/capectracer/requirements_model_training.txt b/capectracer/requirements_model_training.txt
deleted file mode 100644
index 9a9fd05fbada70484ea69ad1979dd039d47fb895..0000000000000000000000000000000000000000
--- a/capectracer/requirements_model_training.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-tqdm==4.64.0
-numpy==1.22.4
-bayesian-optimization==1.4.3
-pandas==1.4.2
diff --git a/capectracer/tokens_dict.py b/capectracer/tokens_dict.py
index a2aa8ebeda84f2b7eca0d2260cea8d0ac86dd2aa..52204b6b531179eaa80ded364b399f9b3c6032e1 100644
--- a/capectracer/tokens_dict.py
+++ b/capectracer/tokens_dict.py
@@ -1,9 +1,10 @@
 import re
 import nltk
+import spacy
 
 from nltk.stem import WordNetLemmatizer
-from nltk.tokenize import word_tokenize
-import spacy
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.corpus import wordnet as wn
 
 class TokensDict:
     def __init__(self):
@@ -13,71 +14,73 @@ class TokensDict:
         nltk.download('averaged_perceptron_tagger')
         nltk.download('omw-1.4')
 
-    def tokenize_text(self, texts):
-        tokens_list = []
-
-        for text in texts:
+        self.en = spacy.load('en_core_web_sm', enable=[""])
+        self.stop_words = {}
+        self.lemmatizer = WordNetLemmatizer()
+
+        for stop_word in self.en.Defaults.stop_words:
+            self.stop_words[hash(stop_word)] = stop_word
+
+        wn.ensure_loaded()
+
+    def lemm_and_rem_sw(self, tokens):
+        preprocessed_tokens = []
+        tagged_tokens = nltk.pos_tag(tokens)
+
+        for token, pos in tagged_tokens:
+            hashed_token = hash(token)
+
+            if not token.isnumeric() and self.stop_words.get(hashed_token) == None:
+                if pos.startswith("N"):
+                    preprocessed_tokens.append(self.lemmatizer.lemmatize(token, "n"))
+                elif pos.startswith("V"):
+                    preprocessed_tokens.append(self.lemmatizer.lemmatize(token, "v"))
+                elif pos.startswith('J'):
+                    preprocessed_tokens.append(self.lemmatizer.lemmatize(token, "a"))
+                elif pos.startswith('R'):
+                    preprocessed_tokens.append(self.lemmatizer.lemmatize(token, "r"))
+                else:
+                    preprocessed_tokens.append(token)
+
+        return preprocessed_tokens
+
+    def tokenize_text(self, text):
+        sentences = sent_tokenize(text)
+        prepro_sentences = []
+
+        for sentence in sentences:
             # Remove punctuation using regular expression
-            prepro_text = re.sub(r'[^a-zA-Z0-9\s/-]', '', text)
+            prepro_text = re.sub(r'[^a-zA-Z0-9\s/-]', '', sentence)
             prepro_text = re.sub(r'[-/]', ' ', prepro_text)
-            # Tokenization using NLTK
-            tokens = word_tokenize(prepro_text)
+            prepro_text = re.sub(r'\s+', ' ', prepro_text)
+            prepro_text = prepro_text.strip()
+            prepro_sentences.append(prepro_text)
 
-            tokens_list.append(tokens)
-
-        return tokens_list
-
-    def remove_numbers(self, tokens_list):
-        filtered_tokens_list = [[token for token in token_set if not token.isnumeric()] \
-            for token_set in tokens_list]
-
-        return filtered_tokens_list
-
-    def lemmatize_tokens(self, tokens_list):
-        en = spacy.load('en_core_web_sm')
-        stop_words = en.Defaults.stop_words
-
-        # Initialize WordNet Lemmatizer
-        lemmatizer = WordNetLemmatizer()
-        lemmatized_token_list = []
-
-        for token_set in tokens_list:
-            lemmatized_tokens = []
-            tagged_tokens = nltk.pos_tag(token_set)
-
-            for token, pos in tagged_tokens:
-                if token not in stop_words:
-                    if pos.startswith("N"):
-                        lemmatized_tokens.append(lemmatizer.lemmatize(token, "n"))
-                    elif pos.startswith("V"):
-                        lemmatized_tokens.append(lemmatizer.lemmatize(token, "v"))
-                    elif pos.startswith('J'):
-                        lemmatized_tokens.append(lemmatizer.lemmatize(token, "a"))
-                    elif pos.startswith('R'):
-                        lemmatized_tokens.append(lemmatizer.lemmatize(token, "r"))
-                    else:
-                        lemmatized_tokens.append(token)
-
-            lemmatized_token_list.append(lemmatized_tokens)
+        word_tokens = ' '.join(prepro_sentences)
+        word_tokens = word_tokens.lower()
+        word_tokens = word_tokenize(word_tokens)
 
-        return lemmatized_token_list
-
-    def lowercase_tokens(self, tokens_list):
-        return [[token.lower() for token in token_set] for token_set in tokens_list]
+        # Tokenization using NLTK
+        preprocessed_tokens = self.lemm_and_rem_sw(word_tokens)
+
+        return prepro_sentences, preprocessed_tokens
+
+    def preprocess(self, corpus):
+        prepro_corpus_sentences = []
+        prepro_corpus_tokens = []
+
+        for doc in corpus:
+            prepro_sentences, prepro_tokens = self.tokenize_text(doc)
+            prepro_corpus_sentences.append(prepro_sentences)
+            prepro_corpus_tokens.append(prepro_tokens)
+
+        return prepro_corpus_sentences, prepro_corpus_tokens
 
     def clean_text(self, text):
-        cleaned_text = re.sub(r'\n', '', text)
+        cleaned_text = text.strip()
+        cleaned_text = re.sub(r'\n', ' ', cleaned_text)
+        cleaned_text = re.sub(r'\.+', '.', cleaned_text)
         cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
         cleaned_text = re.sub(r'^\[[^\]]*\]\s*', '', cleaned_text)
 
         return cleaned_text
-
-    def clean_texts(self, texts):
-        cleaned_texts = []
-
-        for text in texts:
-            cleaned_text = self.clean_text(text)
-
-            cleaned_texts.append(cleaned_text)
-
-        return cleaned_texts
\ No newline at end of file
diff --git a/capectracer/training_lda_model.py b/capectracer/training_lda_model.py
deleted file mode 100644
index 29e4d95b0e011463948bd04e5885d7b417d53e13..0000000000000000000000000000000000000000
--- a/capectracer/training_lda_model.py
+++ /dev/null
@@ -1,206 +0,0 @@
-import tqdm
-import logging
-import numpy as np
-import os.path
-import requests
-
-from capec_specs_dict import CapecSpecsDict
-
-from gensim.corpora import Dictionary
-from gensim.models import LdaModel
-from gensim.models.callbacks import CoherenceMetric, PerplexityMetric, ConvergenceMetric
-from bayes_opt import BayesianOptimization
-from pandas import DataFrame
-from bayes_opt.logger import JSONLogger
-from bayes_opt.event import Events
-from sklearn.model_selection import KFold
-from functools import partial
-
-# Train model
-def train_lda(training_tokens, dictionary, num_topics, passes, iterations, update_every,
-              training_logs, training_results_csv, eval_every, decay, offset):
-    model_dict = {'Topics': [],
-                  'Passes': [],
-                  'Iterations': [],
-                  'Update_Every': [],
-                  'Decays': [],
-                  'Offsets': [],
-                  'Avg_Coherences': [],
-                  'Avg_Perplexities': [],
-                  'Avg_Convergences': [],
-                  'Dev_Coherences': [],
-                  'Dev_Perplexities': [],
-                  'Dev_Convergences': [],
-                  'Epochs': []}
-
-    # Initialize k-fold cross-validation
-    k = 4
-    kf = KFold(n_splits=k, shuffle=True, random_state=1)
-
-    # Initialize a list to store evaluation metrics for each fold
-    coherence_scores = []
-    perplexity_scores = []
-    convergence_scores = []
-
-    k_index = 1
-
-    print("Training with", num_topics, "topics,", passes,
-          "max passes,", iterations, "max iterations,",
-          update_every, "update every,", decay, "decay,", offset, "offset:", sep=" ")
-
-    pbar = tqdm.trange(k)
-
-    # Iterate over each fold
-    for train_index, test_index in kf.split(training_tokens):
-        train_tokens = [training_tokens[i] for i in train_index]
-        val_tokens = [training_tokens[i] for i in test_index]
-
-        train_corpus = [dictionary.doc2bow(doc) for doc in train_tokens]
-        val_corpus = [dictionary.doc2bow(doc) for doc in val_tokens]
-
-        # Configure logging
-        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG,
-                            filename=training_logs)
-
-        logging.info(
-            "Training with %s topics, %s max passes, %s max iterations, %s update every, %s decay, %s offset at %s fold:",
-            num_topics, passes, iterations, update_every, decay, offset, k_index)
-
-        coherence_metric = CoherenceMetric(
-            texts=train_tokens, coherence='c_npmi', window_size=10)
-        perplexity_metric = PerplexityMetric(corpus=val_corpus)
-        convergence_metric = ConvergenceMetric(distance="hellinger")
-
-        lda_model = LdaModel(corpus=train_corpus,
-                             num_topics=num_topics,
-                             id2word=dictionary,
-                             passes=passes,
-                             alpha="auto",
-                             eta="auto",
-                             random_state=1,
-                             iterations=iterations,
-                             update_every=update_every,
-                             decay=decay,
-                             offset=offset,
-                             eval_every=eval_every,
-                             callbacks=[perplexity_metric, coherence_metric, convergence_metric],
-                             chunksize=2000)
-
-        logging.shutdown()
-
-        coherence_scores.append(lda_model.metrics['Coherence'])
-        perplexity_scores.append(lda_model.metrics['Perplexity'])
-        convergence_scores.append(lda_model.metrics['Convergence'])
-
-        k_index += 1
-
-        pbar.update(1)
-
-    pbar.close()
-
-    for epoch, _ in enumerate(coherence_scores[0]):
-        model_dict['Topics'].append(num_topics)
-        model_dict['Passes'].append(passes)
-        model_dict['Iterations'].append(iterations)
-        model_dict['Update_Every'].append(update_every)
-        model_dict['Decays'].append(decay)
-        model_dict['Offsets'].append(offset)
-        model_dict['Epochs'].append(epoch + 1)
-
-    coherence_score_averages = np.mean(coherence_scores, axis=0)
-    perplexity_score_averages = np.mean(perplexity_scores, axis=0)
-    convergence_score_averages = np.mean(convergence_scores, axis=0)
-
-    # Compute the absolute deviation of each element from its corresponding column mean
-    coherence_absolute_deviations = np.abs(coherence_scores - coherence_score_averages)
-    perplexity_absolute_deviations = np.abs(perplexity_scores - perplexity_score_averages)
-    convergence_absolute_deviations = np.abs(convergence_scores - convergence_score_averages)
-
-    # Compute the average of these absolute deviations for each column
-    coherece_absolute_average_deviations = np.mean(coherence_absolute_deviations, axis=0)
-    perplexity_absolute_average_deviations = np.mean(perplexity_absolute_deviations, axis=0)
-    convergence_absolute_average_deviations = np.mean(convergence_absolute_deviations, axis=0)
-
-    model_dict['Avg_Coherences'] = coherence_score_averages
-    model_dict['Avg_Perplexities'] = perplexity_score_averages
-    model_dict['Avg_Convergences'] = convergence_score_averages
-    model_dict['Dev_Coherences'] = coherece_absolute_average_deviations
-    model_dict['Dev_Perplexities'] = perplexity_absolute_average_deviations
-    model_dict['Dev_Convergences'] = convergence_absolute_average_deviations
-
-    model_df = DataFrame.from_dict(model_dict)
-
-    if os.path.isfile(training_results_csv):
-        model_df.to_csv(training_results_csv, mode='a', header=False)
-    else:
-        model_df.to_csv(training_results_csv, mode='w', header=True)
-
-    return coherence_score_averages[len(coherence_score_averages) - 1]
-    # return -perplexity_score_averages[len(perplexity_score_averages) - 1]
-
-def train_lda_wrapper(training_tokens, dictionary, num_topics, passes, iterations, update_every,
-                      training_logs, training_results_csv, eval_every, decay=0.5, offset=1):
-    num_topics = int(num_topics)
-    passes = int(passes)
-    iterations = int(iterations)
-    update_every = int(update_every)
-
-    return train_lda(training_tokens, dictionary, num_topics, passes, iterations, update_every,
-                     training_logs, training_results_csv, eval_every, decay, offset)
-
-def get_capec_file():
-    # Send a GET request to the URL
-    response = requests.get("https://capec.mitre.org/data/xml/capec_latest.xml")
-
-    # Check if the request was successful (status code 200)
-    if response.status_code == 200:
-        # Open the file in binary write mode and write the content
-        with open("capec_latest.xml", 'wb') as file:
-            file.write(response.content)
-
-    return response
-
-if __name__ == "__main__":
-    response = get_capec_file()
-
-    if response.status_code == 200:
-        tokens_dicti = CapecSpecsDict("capec_latest.xml", 'full_corpus.json')
-
-        training_tokens = tokens_dicti.get_training_tokens()
-
-        # Create dictionary and train/test corpus
-        dictionary = Dictionary(training_tokens)
-        dictionary.filter_extremes(no_below=2, no_above=0.3)
-
-        train_lda_partial = partial(train_lda_wrapper,
-                                    training_tokens=training_tokens,
-                                    dictionary=dictionary,
-                                    training_logs="lda_training4.log",
-                                    training_results_csv="lda_training_results4.csv",
-                                    eval_every=10
-                                    )
-
-        # train_lda_partial(num_topics=181,
-        #                   passes=20, iterations=len(tokens_dicti.get_capec_names()), update_every=1, decay=0.7, offset=10)
-
-        optimizer = BayesianOptimization(
-            f=train_lda_partial,
-            pbounds={'num_topics': (2, 30),
-                     'passes': (10, 20),
-                     'iterations': (1, 100),
-                     'update_every': (1, 1),
-                     'decay': (0.5, 1),
-                     'offset': (1, 16)},
-            verbose=2,
-            random_state=1,
-        )
-
-        optimizer.set_gp_params(alpha=1e-3)
-
-        logger = JSONLogger(path="./lda_optimization4_log")
-        optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)
-
-        optimizer.maximize(init_points=10, n_iter=50)
-        print(optimizer.max)
-    else:
-        print(f"Failed to download the list of CAPECs from MITRE. Status code: {response.status_code}.")
\ No newline at end of file
diff --git a/src/main/java/ai/AIAttackPatternTree3.java b/src/main/java/ai/AIAttackPatternTree3.java
index 254981413e76ea5554ca5e778fd74a81c0e50a7f..c1d714bc90d732973b5e98fbad3233152871bcfc 100644
--- a/src/main/java/ai/AIAttackPatternTree3.java
+++ b/src/main/java/ai/AIAttackPatternTree3.java
@@ -29,9 +29,9 @@ public class AIAttackPatternTree3 extends AIInteract {
             "specification in \"description\". " +
             "# Respect: All words in \"description\" must be separated with spaces.";
 
-    private static final String KNOWLEDGE_ON_JSON_FOR_ATTACK_SCEN = "When you are asked to identify all the possible attack scenarios " +
-            "that an attacker needs to complete or could possibly perform to successfully achieve an attack, " +
-            "return them as a JSON specification formatted as follows: " +
+    private static final String KNOWLEDGE_ON_JSON_FOR_ATTACK_SCEN = "When you are asked to identify all the possible " +
+            "attack scenarios that an attacker would perform to successfully achieve an " +
+            "attack, return them as a JSON specification formatted as follows: " +
             "{\"attack\": \"NameOfAttack\", \"attackscenarios\": [{\"name\": \"NameOfAttackScenario\", \"description\": \"" +
             "The description of the attack scenario and how it brings an attacker closer to the attack.\", " +
             "\"operator\": \"OR or AND\"} ...]} " +
@@ -45,7 +45,7 @@
             "# Respect: \"operator\" must be only \"AND\" or \"OR\". Use \"AND\" to denote that an attacker must " +
             "complete all of the attack scenarios with an AND operator simultaneously to get " +
             "closer to the attack. Use \"OR\" to denote that an attacker only needs to complete one of the attack " +
-            "scenarios amongst all of the attack scenarios with an OR operator to get closer to the attack." +
+            "scenarios amongst all of the attack scenarios with an OR operator to get closer to the attack. " +
             "# Respect: The number of attack scenarios with the AND operator should be either zero, or greater than one. " +
             "# Respect: The number of attack scenarios with the OR operator should be either zero, or greater than one. " +
             "# Respect: If there are no attack scenarios that are able to be identified, have \"attackscenarios\" be " +
@@ -63,7 +63,7 @@
             "# Respect: Include what the attack step is and how it needs to be completed by an attacker for " +
             "the attack scenario in \"description\". " +
             "# Respect: All words in \"description\" must be separated with spaces. " +
-            "# Respect: The elements of \"attacksteps\" need to be ordered sequentially. That is, the first indexed element" +
+            "# Respect: The elements of \"attacksteps\" need to be ordered sequentially. That is, the first indexed element " +
            "in \"attacksteps\" is the step that an attacker needs to complete first while the last indexed element is the step " +
             "that an attacker needs to complete last. " +
             "# Respect: There must be at least two attack steps in \"attacksteps\".";
@@ -102,15 +102,17 @@ public class AIAttackPatternTree3 extends AIInteract {
 
     private final String[] QUESTION_IDENTIFY_ATD = {"From the provided system specification " +
             "and using the specified JSON format, identify a possible objective that an attacker would " +
-            "want to and could feasibly achieve from exploiting the system using the list of provided attack " +
-            "patterns. Do respect the JSON format, and " +
-            "provide only JSON (no explanation before or after).\n",
-
-            "Using the list of attack patterns (if provided) and the specified JSON format, " +
-            "identify all of the attack scenarios that an attacker can perform or needs to perform " +
-            "simultaneously to achieve the provided attack. If provided with a system specification, " +
-            "make sure to associate the specification with the attack scenarios. " +
-            "Do respect the JSON format, and provide only JSON (no explanation before or after).\n",
+            "want to and could feasibly achieve from exploiting the system. If provided with a list of attack " +
+            "patterns, use at least one attack pattern from this list to identify a possible objective. " +
+            "Do respect the JSON format, and provide only JSON (no explanation before or after).\n",
+
+            "Using the specified JSON format, " +
+            "identify all of the attack scenarios that an attacker would perform, " +
+            "either simultaneously or separately, to achieve the provided attack. If provided with a " +
+            "system specification, make sure to associate the specification with the attack scenarios. " +
+            "In addition, if provided with a list of attack patterns, use at least one attack pattern " +
+            "from this list to identify the attack scenarios. Do respect the JSON format, and provide " +
+            "only JSON (no explanation before or after).\n",
 
             "Identify all of the attack steps that an attacker needs to conduct to " +
             "achieve the provided attack scenario. Do respect the JSON format, and " +
@@ -181,7 +183,7 @@ public class AIAttackPatternTree3 extends AIInteract {
             chatData.aiinterface.addKnowledge("The system specification is: " + _spec, "ok");
         }
 
-        if (_attackPatterns != null) {
+        if (_attackPatterns != null && _attackPatterns.length > 0) {
             StringBuilder builder = new StringBuilder();
 
             for (int i = 0; i <= _attackPatterns.length - 1; i++) {
@@ -197,7 +199,7 @@
             }
 
             TraceManager.addDev("\nKnowledge added: " + builder);
-            chatData.aiinterface.addKnowledge("The attack pattern is: " + builder, "ok");
+            chatData.aiinterface.addKnowledge("The attack patterns are: " + builder, "ok");
         }
 
         if (previousRootAtts != null && !previousRootAtts.isEmpty()) {
@@ -446,7 +448,7 @@ public class AIAttackPatternTree3 extends AIInteract {
     }
 
     private JSONArray checkAttackScenarios(String _spec, String attack, boolean isRoot,
-                                          Collection<String> _errors) throws org.json.JSONException {
+                                           Collection<String> _errors) throws org.json.JSONException {
         String attackName;
 
         if (isRoot) {
diff --git a/src/main/java/ai/CAPECTracer.java b/src/main/java/ai/CAPECTracer.java
index 931271fe85b5bd9fddd45bab63abe032ff307c15..bc76d56948e914ade45fa9ef9a3bf2a39d7c4407 100644
--- a/src/main/java/ai/CAPECTracer.java
+++ b/src/main/java/ai/CAPECTracer.java
@@ -89,9 +89,10 @@ public class CAPECTracer extends AIInteract {
                 characterInt = data.read();
             }
 
-            TraceManager.addDev(output.toString());
-            byte[] bytes = Files.readAllBytes(Path.of(capecTracerFolder + "/traced_capecs.txt"));
-            traces = new String(bytes);
+            if (!output.toString().contains("Error")) {
+                byte[] bytes = Files.readAllBytes(Path.of(capecTracerFolder + "/traced_capecs.txt"));
+                traces = new String(bytes);
+            }
         } catch (IOException | LauncherException e) {
             TraceManager.addDev(e.getMessage());
         }