Feat: Clean text for better processing

2020-10-11 00:13:01 +05:30 · 2020-10-11 00:13:01 +05:30 · 3dc0241a4d
commit 3dc0241a4d
parent 37af1c41fd
1 changed files with 44 additions and 19 deletions
--- a/question_generation_main.py
+++ b/question_generation_main.py
@ -4,6 +4,9 @@ generation modules
 '''
 from question_extraction import QuestionExtractor
 from incorrect_answer_generation import IncorrectAnswerGenerator
 import re
 from nltk import sent_tokenize
 class QuestionGeneration:
    '''This class contains the method
@ -15,11 +18,33 @@ class QuestionGeneration:
        self.num_options = num_options
        self.question_extractor = QuestionExtractor(num_questions)
    def clean_text(self, text):
        '''Normalize raw text into plain, period-terminated sentences.

        Newlines are replaced by spaces and the text is split into
        sentences with ``sent_tokenize``. Within each sentence every
        character that is not whitespace or a word character (and every
        underscore) is removed, and runs of spaces are collapsed to one.
        Each remaining sentence is terminated with a period and followed
        by a single space.

        Args:
            text: the raw document string.

        Returns:
            The cleaned text, where every sentence has the form
            ``"<words>. "`` (note the trailing pad space). An empty
            string is returned when no sentence survives cleaning.
        '''
        text = text.replace('\n', ' ')  # remove newline chars

        cleaned_sentences = []
        for sentence in sent_tokenize(text):
            # remove non alphanumeric chars
            cleaned_sentence = re.sub(r'([^\s\w]|_)+', '', sentence)
            # substitute multiple spaces with single space
            cleaned_sentence = re.sub(' +', ' ', cleaned_sentence)
            # Stripping punctuation can leave a dangling space (e.g.
            # "word ." -> "word "). Strings are immutable, so trim it
            # here instead of assigning to the last index.
            cleaned_sentence = cleaned_sentence.rstrip(' ')
            # A sentence made only of punctuation cleans to nothing;
            # skip it so no stray period is emitted.
            if not cleaned_sentence:
                continue
            cleaned_sentences.append(cleaned_sentence)

        # Terminate every sentence with a period and pad with a space.
        return ''.join(sentence + '. ' for sentence in cleaned_sentences)
    def generate_questions_dict(self, document):
-		self.questions_dict = self.question_extractor.get_questions_dict(document)
+        document = self.clean_text(document)
        self.questions_dict = self.question_extractor.get_questions_dict(
            document)
        self.incorrect_answer_generator = IncorrectAnswerGenerator(document)
-		for i in range(1, self.num_questions+1):
+        for i in range(1, self.num_questions + 1):
            if i not in self.questions_dict:
                continue
            self.questions_dict[i]["options"] \