'''Module for extracting fill-in-the-blank (cloze) questions from raw text.

Candidate keywords are named entities found by spaCy's NER tagger. Each
candidate is scored by the average TF-IDF score of its words across the
document's sentences; the top-ranked keywords are blanked out of their
best-matching sentence to form questions.
'''
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy


class QuestionExtraction:
    ''' This class contains all the methods
    required for extracting questions from
    a given document
    '''

    def __init__(self, num_questions):
        # Maximum number of questions to generate per document.
        self.num_questions = num_questions

        # hash set for fast (O(1)) stop-word lookup
        self.stop_words = set(stopwords.words('english'))

        # named entity recognition tagger (medium English model)
        self.ner_tagger = spacy.load('en_core_web_md')

        self.vectorizer = TfidfVectorizer()

        # question_number -> {"question": str, "answer": str}
        self.questions_dict = dict()

    def get_questions(self, document):
        '''
        Returns a dict of questions in the format:
        question_number: {
            question: str
            answer: str
        }

        Params:
        * document : string
        Returns:
        * dict
        '''
        # find candidate keywords (named entities)
        self.candidate_keywords = self.get_candidate_entities(document)

        # set word scores before ranking candidate keywords
        self.set_tfidf_scores(document)

        # rank the keywords using calculated tf-idf scores
        self.rank_keywords()

        # form the questions from the top-ranked keywords
        self.form_questions()

        return self.questions_dict

    def get_filtered_sentences(self, document):
        ''' Returns a list of sentences - each of
        which has been cleaned of stopwords.

        Params:
        * document: a paragraph of sentences
        Returns:
        * list : list of string
        '''
        sentences = sent_tokenize(document)  # split document into sentences

        return [self.filter_sentence(sentence) for sentence in sentences]

    def filter_sentence(self, sentence):
        '''Returns the sentence without stopwords.

        Tokens are lower-cased for the lookup so that sentence-initial
        words such as "The" are filtered as well (the NLTK stop-word
        list is all lowercase).

        Params:
        * sentence: A string
        Returns:
        * string
        '''
        words = word_tokenize(sentence)
        return ' '.join(w for w in words if w.lower() not in self.stop_words)

    def get_candidate_entities(self, document):
        ''' Returns a list of entities according to
        spacy's ner tagger. These entities are candidates
        for the questions

        Params:
        * document : string
        Returns:
        * list
        '''
        entities = self.ner_tagger(document)

        # remove duplicates; note set order is arbitrary, but candidates
        # are re-ranked by score afterwards so this does not matter
        return list({ent.text for ent in entities.ents})

    def set_tfidf_scores(self, document):
        ''' Sets the tf-idf scores for each word.

        Populates:
        * self.word_score: word -> average tf-idf score over all sentences
        * self.sentence_for_max_word_score: word -> original (unfiltered)
          sentence in which that word's tf-idf score is highest
        '''
        self.unfiltered_sentences = sent_tokenize(document)
        self.filtered_sentences = self.get_filtered_sentences(document)

        self.word_score = dict()  # (word, score)

        # (word, sentence where word score is max)
        self.sentence_for_max_word_score = dict()

        tf_idf_vector = self.vectorizer.fit_transform(self.filtered_sentences)

        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement when available and fall back for older versions.
        if hasattr(self.vectorizer, 'get_feature_names_out'):
            feature_names = self.vectorizer.get_feature_names_out()
        else:
            feature_names = self.vectorizer.get_feature_names()

        tf_idf_matrix = tf_idf_vector.todense().tolist()

        num_sentences = len(self.unfiltered_sentences)
        num_features = len(feature_names)

        for i in range(num_features):
            word = feature_names[i]
            self.sentence_for_max_word_score[word] = ""
            tot = 0.0
            cur_max = 0.0

            for j in range(num_sentences):
                tot += tf_idf_matrix[j][i]

                if tf_idf_matrix[j][i] > cur_max:
                    cur_max = tf_idf_matrix[j][i]
                    self.sentence_for_max_word_score[word] = self.unfiltered_sentences[j]

            # average score for each word
            self.word_score[word] = tot / num_sentences

    def get_keyword_score(self, keyword):
        ''' Returns the score for a keyword
        Params:
        * keyword : string of possibly several words
        Returns:
        * float : sum of the average tf-idf scores of the keyword's words
        '''
        score = 0.0
        for word in word_tokenize(keyword):
            if word in self.word_score:
                score += self.word_score[word]
        return score

    def get_corresponding_sentence_for_keyword(self, keyword):
        ''' Finds and returns a sentence containing
        all words of the keyword, or "" if none is found.
        '''
        words = word_tokenize(keyword)
        for word in words:

            if word not in self.sentence_for_max_word_score:
                continue

            sentence = self.sentence_for_max_word_score[word]

            # accept this sentence only if every keyword word appears in it
            all_present = True
            for w in words:
                if w not in sentence:
                    all_present = False

            if all_present:
                return sentence
        return ""

    def rank_keywords(self):
        '''Rank keywords according to their score (descending).'''
        self.candidate_triples = []  # (score, keyword, corresponding sentence)

        for candidate_keyword in self.candidate_keywords:
            self.candidate_triples.append([
                self.get_keyword_score(candidate_keyword),
                candidate_keyword,
                self.get_corresponding_sentence_for_keyword(candidate_keyword)
            ])

        self.candidate_triples.sort(reverse=True)

    def form_questions(self):
        ''' Forms the questions and populates
        the question dict.

        Each question is the keyword's corresponding sentence with the
        keyword blanked out by underscores; a sentence is used for at
        most one question, and keywords without a matching sentence
        are skipped.
        '''
        used_sentences = set()  # O(1) membership instead of a list scan
        idx = 0
        cntr = 1
        num_candidates = len(self.candidate_triples)
        while cntr <= self.num_questions and idx < num_candidates:
            score, keyword, sentence = self.candidate_triples[idx]

            # skip keywords with no matching sentence and reused sentences
            if sentence and sentence not in used_sentences:
                used_sentences.add(sentence)

                self.questions_dict[cntr] = {
                    "question": sentence.replace(keyword,
                                                 '_' * len(keyword)),
                    "answer": keyword
                }

                cntr += 1
            idx += 1