Feat: Clean text for better processing

This commit is contained in:
telescopic 2020-10-11 00:13:01 +05:30
parent 37af1c41fd
commit 3dc0241a4d

View File

@ -4,28 +4,53 @@ generation modules
''' '''
from question_extraction import QuestionExtractor from question_extraction import QuestionExtractor
from incorrect_answer_generation import IncorrectAnswerGenerator from incorrect_answer_generation import IncorrectAnswerGenerator
import re
from nltk import sent_tokenize
class QuestionGeneration:
    '''Generate questions, each with a set of answer options, from a document.

    The work is delegated to QuestionExtractor (question/answer pairs) and
    IncorrectAnswerGenerator (answer options); this class cleans the input
    text first and then wires the two together.
    '''

    # One or more consecutive characters that are neither whitespace nor
    # word characters, or that are underscores (which \w would keep).
    _NON_ALNUM_RE = re.compile(r'([^\s\w]|_)+')
    # Runs of spaces left behind once punctuation has been removed.
    _MULTI_SPACE_RE = re.compile(' +')

    def __init__(self, num_questions, num_options):
        '''Store the requested sizes and create the question extractor.

        Args:
            num_questions: upper bound of the question indices that
                generate_questions_dict iterates over (1..num_questions);
                also passed to QuestionExtractor.
            num_options: passed to get_all_options_dict for each question.
        '''
        self.num_questions = num_questions
        self.num_options = num_options
        self.question_extractor = QuestionExtractor(num_questions)

    @staticmethod
    def _clean_sentences(sentences):
        '''Strip punctuation from each sentence and join them as "s1. s2. ".

        Sentences that contain nothing but punctuation/whitespace are
        dropped instead of contributing a stray "." to the output.
        '''
        cleaned_parts = []
        for sentence in sentences:
            # remove non alphanumeric chars
            cleaned = QuestionGeneration._NON_ALNUM_RE.sub('', sentence)
            # substitute multiple spaces with single space
            cleaned = QuestionGeneration._MULTI_SPACE_RE.sub(' ', cleaned)
            # Removing trailing punctuation can leave a dangling space
            # ("world !" -> "world "). str is immutable, so build a new
            # string rather than assigning to cleaned[-1].
            cleaned = cleaned.rstrip(' ')
            if cleaned:
                cleaned_parts.append(cleaned)
        # Terminate every sentence with '.' and pad with a space at the end.
        return ''.join(part + '. ' for part in cleaned_parts)

    def clean_text(self, text):
        '''Return text with newlines and punctuation removed.

        The text is split into sentences with nltk's sent_tokenize; each
        sentence is cleaned and re-terminated with ". ".

        Args:
            text: the raw document string.

        Returns:
            The cleaned string; empty if no sentence has any
            alphanumeric content.
        '''
        text = text.replace('\n', ' ')  # remove newline chars
        return self._clean_sentences(sent_tokenize(text))

    def generate_questions_dict(self, document):
        '''Build the questions dictionary for document.

        The document is cleaned, handed to the question extractor, and
        every extracted question (keys 1..num_questions that are present)
        gets an "options" entry computed from its "answer".

        Args:
            document: the raw document string.

        Returns:
            The questions dictionary, also stored on self.questions_dict.
        '''
        document = self.clean_text(document)
        self.questions_dict = self.question_extractor.get_questions_dict(
            document)
        self.incorrect_answer_generator = IncorrectAnswerGenerator(document)

        for i in range(1, self.num_questions + 1):
            # The extractor may return fewer questions than requested.
            if i not in self.questions_dict:
                continue
            question = self.questions_dict[i]
            question["options"] = \
                self.incorrect_answer_generator.get_all_options_dict(
                    question["answer"],
                    self.num_options
                )

        return self.questions_dict