From 3dc0241a4dcf0b2f1b5b05c7cbf93214c717b919 Mon Sep 17 00:00:00 2001
From: telescopic <vigneshsuresh1775@gmail.com>
Date: Sun, 11 Oct 2020 00:13:01 +0530
Subject: [PATCH] Feat: Clean text for better processing

---
 question_generation_main.py | 63 ++++++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 19 deletions(-)

diff --git a/question_generation_main.py b/question_generation_main.py
index 1cc3419..7d66d1c 100644
--- a/question_generation_main.py
+++ b/question_generation_main.py
@@ -4,28 +4,53 @@ generation modules
 '''
 from question_extraction import QuestionExtractor
 from incorrect_answer_generation import IncorrectAnswerGenerator
+import re
+from nltk import sent_tokenize
+
 
 class QuestionGeneration:
-	'''This class contains the method
-	to generate questions
-	'''
+    '''This class contains the method
+    to generate questions
+    '''
 
-	def __init__(self, num_questions, num_options):
-		self.num_questions = num_questions
-		self.num_options = num_options
-		self.question_extractor = QuestionExtractor(num_questions)
+    def __init__(self, num_questions, num_options):
+        self.num_questions = num_questions
+        self.num_options = num_options
+        self.question_extractor = QuestionExtractor(num_questions)
 
-	def generate_questions_dict(self, document):
-		self.questions_dict = self.question_extractor.get_questions_dict(document)
-		self.incorrect_answer_generator = IncorrectAnswerGenerator(document)
+    def clean_text(self, text):
+        text = text.replace('\n', ' ')  # remove newline chars
+        sentences = sent_tokenize(text)
+        cleaned_text = ""
+        for sentence in sentences:
+            # remove non alphanumeric chars
+            cleaned_sentence = re.sub(r'([^\s\w]|_)+', '', sentence)
 
-		for i in range(1, self.num_questions+1):
-			if i not in self.questions_dict:
-				continue
-			self.questions_dict[i]["options"] \
-				= self.incorrect_answer_generator.get_all_options_dict(
-						self.questions_dict[i]["answer"],
-						self.num_options
-						)
+            # substitute multiple spaces with single space
+            cleaned_sentence = re.sub(' +', ' ', cleaned_sentence)
+            cleaned_text += cleaned_sentence
 
-		return self.questions_dict
+            if cleaned_text[-1] == ' ':
+                cleaned_text[-1] = '.'
+            else:
+                cleaned_text += '.'
+
+            cleaned_text += ' '  # pad with space at end
+        return cleaned_text
+
+    def generate_questions_dict(self, document):
+        document = self.clean_text(document)
+        self.questions_dict = self.question_extractor.get_questions_dict(
+            document)
+        self.incorrect_answer_generator = IncorrectAnswerGenerator(document)
+
+        for i in range(1, self.num_questions + 1):
+            if i not in self.questions_dict:
+                continue
+            self.questions_dict[i]["options"] \
+                = self.incorrect_answer_generator.get_all_options_dict(
+                self.questions_dict[i]["answer"],
+                self.num_options
+            )
+
+        return self.questions_dict