'''This module contains the class for generating incorrect
alternative (distractor) answers for a given correct answer.
'''
import random

import gensim.downloader as api
from nltk.tokenize import sent_tokenize, word_tokenize


class IncorrectAnswerGenerator:
    '''Generates ``num_options`` options for a question, of which
    exactly one (randomly placed) is the correct answer.

    Strategy: ask the embedding model for the answer's nearest
    neighbours; if the answer is out-of-vocabulary or the model yields
    too few neighbours, fall back to ranking every distinct word of the
    source document by its similarity to the answer.
    '''

    def __init__(self, document):
        # Pre-trained GloVe vectors used to fetch words similar to the
        # answer.  NOTE: api.load downloads the model on first use.
        self.model = api.load("glove-wiki-gigaword-100")

        # Distinct vocabulary of the document — the fallback pool of
        # candidate distractors.
        self.all_words = []
        for sent in sent_tokenize(document):
            self.all_words.extend(word_tokenize(sent))
        self.all_words = list(set(self.all_words))

    def get_all_options_dict(self, answer, num_options):
        '''Return a dict mapping 1..num_options to option strings,
        out of which exactly one entry is ``answer``.

        Params:
        * answer : str — the correct answer
        * num_options : int — total number of options to produce
        Returns:
        * dict : {option_number: option_string}
        '''
        options_dict = dict()
        try:
            # Top-15 neighbours, reversed so the *least* similar of the
            # top-15 come first — distractors are taken from the far end
            # of the neighbour list rather than the near-synonyms.
            similar_words = self.model.similar_by_word(answer, topn=15)[::-1]

            for i in range(1, num_options + 1):
                options_dict[i] = similar_words[i - 1][0]

        except (KeyError, IndexError):
            # KeyError: `answer` is out-of-vocabulary for the model.
            # IndexError: fewer than num_options neighbours came back.
            # Fall back to ranking the document's own words.
            self.all_sim = []
            for word in self.all_words:
                # NOTE(review): substring test — any word occurring
                # *inside* the answer string is pushed to the bottom of
                # the ranking with score -1.0.  Confirm this is intended
                # rather than a whole-word comparison.
                if word not in answer:
                    try:
                        self.all_sim.append(
                            (self.model.similarity(answer, word), word))
                    except KeyError:
                        # Word missing from the model vocabulary:
                        # neutral score so it ranks below real matches.
                        self.all_sim.append((0.0, word))
                else:
                    self.all_sim.append((-1.0, word))

            # Highest similarity first; ties break on the word itself.
            self.all_sim.sort(reverse=True)

            for i in range(1, num_options + 1):
                options_dict[i] = self.all_sim[i - 1][1]

        # Overwrite one randomly chosen slot with the correct answer.
        replacement_idx = random.randint(1, num_options)
        options_dict[replacement_idx] = answer

        return options_dict
'''This file contains the module for extracting fill-in-the-blank
questions from a document by ranking named entities with TF-IDF scores.
'''
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer


class QuestionExtractor:
    ''' This class contains all the methods
    required for extracting questions from
    a given document
    '''

    def __init__(self, num_questions):
        # Maximum number of questions to extract.
        self.num_questions = num_questions

        # Hash set for fast stopword lookup.
        self.stop_words = set(stopwords.words('english'))

        # Named entity recognition tagger.
        self.ner_tagger = spacy.load('en_core_web_md')

        self.vectorizer = TfidfVectorizer()

        # Populated by form_questions(): {number: {question, answer}}.
        self.questions_dict = dict()

    def get_questions_dict(self, document):
        '''
        Returns a dict of questions in the format:
        question_number: {
            question: str
            answer: str
        }

        Params:
        * document : string
        Returns:
        * dict
        '''
        # Find candidate keywords (named entities).
        self.candidate_keywords = self.get_candidate_entities(document)

        # Set word scores before ranking candidate keywords.
        self.set_tfidf_scores(document)

        # Rank the keywords using the calculated tf-idf scores.
        self.rank_keywords()

        # Form the questions.
        self.form_questions()

        return self.questions_dict

    def get_filtered_sentences(self, document):
        ''' Returns a list of sentences - each of
        which has been cleaned of stopwords.
        Params:
        * document: a paragraph of sentences
        Returns:
        * list : list of string
        '''
        sentences = sent_tokenize(document)  # split document into sentences
        return [self.filter_sentence(sentence) for sentence in sentences]

    def filter_sentence(self, sentence):
        '''Returns the sentence without stopwords
        Params:
        * sentence: A string
        Returns:
        * string
        '''
        words = word_tokenize(sentence)
        return ' '.join(w for w in words if w not in self.stop_words)

    def get_candidate_entities(self, document):
        ''' Returns a list of entities according to
        spacy's ner tagger. These entities are candidates
        for the questions

        Params:
        * document : string
        Returns:
        * list
        '''
        entities = self.ner_tagger(document)
        entity_list = []

        for ent in entities.ents:
            entity_list.append(ent.text)

        return list(set(entity_list))  # remove duplicates

    def set_tfidf_scores(self, document):
        ''' Sets the tf-idf scores for each word'''
        self.unfiltered_sentences = sent_tokenize(document)
        self.filtered_sentences = self.get_filtered_sentences(document)

        self.word_score = dict()  # (word, score)

        # (word, sentence where word score is max)
        self.sentence_for_max_word_score = dict()

        tf_idf_vector = self.vectorizer.fit_transform(self.filtered_sentences)

        # scikit-learn renamed get_feature_names() to
        # get_feature_names_out() in 1.0 and removed the old name in
        # 1.2 — support both.
        if hasattr(self.vectorizer, "get_feature_names_out"):
            feature_names = self.vectorizer.get_feature_names_out()
        else:
            feature_names = self.vectorizer.get_feature_names()

        tf_idf_matrix = tf_idf_vector.todense().tolist()

        num_sentences = len(self.unfiltered_sentences)
        num_features = len(feature_names)

        for i in range(num_features):
            word = feature_names[i]
            self.sentence_for_max_word_score[word] = ""
            tot = 0.0
            cur_max = 0.0

            for j in range(num_sentences):
                tot += tf_idf_matrix[j][i]

                # Remember the (unfiltered) sentence where this word's
                # tf-idf score peaks — used later to build the question.
                if tf_idf_matrix[j][i] > cur_max:
                    cur_max = tf_idf_matrix[j][i]
                    self.sentence_for_max_word_score[word] = \
                        self.unfiltered_sentences[j]

            # Average score for each word (over *all* sentences).
            self.word_score[word] = tot / num_sentences

    def get_keyword_score(self, keyword):
        ''' Returns the score for a keyword
        Params:
        * keyword : string of possibly several words
        Returns:
        * float : score
        '''
        score = 0.0
        for word in word_tokenize(keyword):
            if word in self.word_score:
                score += self.word_score[word]
        return score

    def get_corresponding_sentence_for_keyword(self, keyword):
        ''' Finds and returns a sentence containing
        the keyword, or "" when none is found.
        '''
        words = word_tokenize(keyword)
        for word in words:

            if word not in self.sentence_for_max_word_score:
                continue

            sentence = self.sentence_for_max_word_score[word]

            # NOTE(review): substring containment — a word like "art"
            # also matches inside "start"; confirm this looseness is
            # acceptable.
            all_present = True
            for w in words:
                if w not in sentence:
                    all_present = False

            if all_present:
                return sentence
        return ""

    def rank_keywords(self):
        '''Rank keywords according to their score'''
        self.candidate_triples = []  # (score, keyword, corresponding sentence)

        for candidate_keyword in self.candidate_keywords:
            self.candidate_triples.append([
                self.get_keyword_score(candidate_keyword),
                candidate_keyword,
                self.get_corresponding_sentence_for_keyword(candidate_keyword)
            ])

        # Highest score first; ties break on the keyword string.
        self.candidate_triples.sort(reverse=True)

    def form_questions(self):
        ''' Forms the questions and populates
        the question dict.  Each distinct sentence is used at most once,
        and keywords with no matching sentence are skipped.
        '''
        used_sentences = set()  # O(1) membership checks
        idx = 0
        cntr = 1
        num_candidates = len(self.candidate_triples)
        while cntr <= self.num_questions and idx < num_candidates:
            score, keyword, sentence = self.candidate_triples[idx]

            # Skip empty sentences (keyword had no matching sentence) —
            # they would otherwise yield a blank question.
            if sentence and sentence not in used_sentences:
                used_sentences.add(sentence)

                self.questions_dict[cntr] = {
                    # Blank out the keyword to create the question.
                    "question": sentence.replace(
                        keyword, '_' * len(keyword)),
                    "answer": keyword
                }

                cntr += 1
            idx += 1
'''This module ties together the question extraction and incorrect
answer generation modules.
'''
from question_extraction import QuestionExtractor
from incorrect_answer_generation import IncorrectAnswerGenerator


class QuestionGeneration:
    '''Produces complete multiple-choice questions: extracted
    fill-in-the-blank questions, each augmented with answer options.
    '''

    def __init__(self, num_questions, num_options):
        self.num_questions = num_questions
        self.num_options = num_options
        self.question_extractor = QuestionExtractor(num_questions)

    def generate_questions_dict(self, document):
        '''Return {question_number: {question, answer, options}} for the
        given document, with at most num_questions entries.
        '''
        self.questions_dict = self.question_extractor.get_questions_dict(document)
        self.incorrect_answer_generator = IncorrectAnswerGenerator(document)

        for q_num in range(1, self.num_questions + 1):
            # Fewer questions than requested may have been extracted.
            entry = self.questions_dict.get(q_num)
            if entry is None:
                continue
            entry["options"] = \
                self.incorrect_answer_generator.get_all_options_dict(
                    entry["answer"],
                    self.num_options
                )

        return self.questions_dict