# Reconstructed post-patch contents of question_generation_main.py
# (commit "Feat: Clean text for better processing").
'''Entry point that ties together the question extraction and
incorrect answer generation modules.
'''
import re

from nltk import sent_tokenize

from incorrect_answer_generation import IncorrectAnswerGenerator
from question_extraction import QuestionExtractor

# Runs of characters that are neither whitespace nor word characters,
# plus underscores (which \w would otherwise keep).
_NON_ALNUM_RE = re.compile(r'([^\s\w]|_)+')

# Runs of one or more literal spaces, collapsed to a single space.
_MULTI_SPACE_RE = re.compile(' +')


class QuestionGeneration:
    '''Generate questions, each with a set of answer options, from a
    text document.

    The document is cleaned first, then handed to ``QuestionExtractor``
    to obtain the questions and to ``IncorrectAnswerGenerator`` to
    obtain the options for every question.
    '''

    def __init__(self, num_questions, num_options):
        '''Store the generation settings and build the extractor.

        Args:
            num_questions: number of questions requested; also passed
                to ``QuestionExtractor``.
            num_options: number of options requested per question.
        '''
        self.num_questions = num_questions
        self.num_options = num_options
        self.question_extractor = QuestionExtractor(num_questions)

    def clean_text(self, text):
        '''Return ``text`` normalised into plain, period-terminated
        sentences.

        Newlines become spaces, the text is split with
        ``sent_tokenize``, and in every sentence all characters other
        than whitespace, letters and digits are removed and runs of
        spaces are collapsed. Each surviving sentence is emitted as
        ``"<sentence>. "``, so a non-empty result always ends with a
        padding space.

        Sentences that are empty after cleaning (for example ones made
        of punctuation only) are dropped, and empty input yields ``""``.

        Args:
            text: the raw document string.

        Returns:
            The cleaned document string.
        '''
        text = text.replace('\n', ' ')  # remove newline chars

        cleaned_sentences = []
        for sentence in sent_tokenize(text):
            cleaned = _NON_ALNUM_RE.sub('', sentence)
            # Stripping punctuation can leave a dangling space
            # ("Hello world ." -> "Hello world "). Strings are immutable,
            # so trim it here instead of overwriting the last character.
            cleaned = _MULTI_SPACE_RE.sub(' ', cleaned).strip()
            if cleaned:
                cleaned_sentences.append(cleaned)

        # Terminate every sentence with a period and pad with a space.
        return ''.join(sentence + '. ' for sentence in cleaned_sentences)

    def generate_questions_dict(self, document):
        '''Build the questions dictionary for ``document``.

        The document is cleaned with ``clean_text`` before use. Every
        entry with an integer key in ``1..num_questions`` gets an
        ``"options"`` value computed from its ``"answer"`` value;
        missing keys are skipped.

        Side effects: sets ``self.questions_dict`` and
        ``self.incorrect_answer_generator``.

        Args:
            document: the raw document string.

        Returns:
            The questions dictionary (the same object stored on
            ``self.questions_dict``).
        '''
        document = self.clean_text(document)
        self.questions_dict = self.question_extractor.get_questions_dict(
            document)
        self.incorrect_answer_generator = IncorrectAnswerGenerator(document)

        for i in range(1, self.num_questions + 1):
            # The extractor may return fewer entries than requested.
            if i not in self.questions_dict:
                continue
            question = self.questions_dict[i]
            question["options"] = (
                self.incorrect_answer_generator.get_all_options_dict(
                    question["answer"],
                    self.num_options
                )
            )

        return self.questions_dict