Source code for SciAssist.pipelines.summarization

# main developer: Yixi Ding <dingyixi@hotmail.com>
import json
import math
import os
from typing import List, Tuple, Optional, Dict

from datasets import Dataset

from SciAssist import BASE_TEMP_DIR, BASE_OUTPUT_DIR
from SciAssist.pipelines.pipeline import Pipeline
from SciAssist.pipelines.testing_pipeline import test
from SciAssist.utils.pdf2text import process_pdf_file, get_bodytext
from SciAssist.utils.windows_pdf2text import windows_get_bodytext


class Summarization(Pipeline):
    """
    The pipeline for single document summarization.

    Args:
        model_name (`str`, *optional*):
            A string, the *model name* of a pretrained model provided for this task.
        device (`str`, *optional*):
            A string, `cpu` or `gpu`.
        task_name (`str`, *optional*):
            The task name forwarded to the base `Pipeline`. Defaults to
            `controlled-summarization`.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model should be
            cached if the standard cache should not be used.
        output_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which the predicted results files should be stored.
        temp_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory which holds temporary files such as `.tei.xml`.
        tokenizer (PreTrainedTokenizer, *optional*):
            A specific tokenizer.
        checkpoint (`str` or `os.PathLike`, *optional*):
            A checkpoint for the tokenizer. You can also specify the `checkpoint`
            while using the default tokenizer. Can be either:

            - A string, the *model id* of a predefined tokenizer hosted inside a model
              repo on huggingface.co. Valid model ids can be located at the root-level,
              like `bert-base-uncased`, or namespaced under a user or organization name,
              like `facebook/bart-large-cnn`.
            - A path to a *directory* containing vocabulary files required by the
              tokenizer, for instance saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g.,
              `./my_model_directory/`.
            - A path or url to a single saved vocabulary file if and only if the
              tokenizer only requires a single vocabulary file (like Bert or XLNet),
              e.g.: `./my_model_directory/vocab.txt`. (Not applicable to all derived
              classes)
        model_max_length (`int`, *optional*):
            The max sequence length the model accepts.
        max_source_length (`int`, *optional*):
            The max length of the input text.
        max_target_length (`int`, *optional*):
            The max length of the generated summary.
        os_name (`str`, *optional*):
            Overrides `os.name` when choosing the PDF text-extraction backend.
            `posix` and `nt` are the values handled by this class.
    """

    def __init__(
        self,
        model_name: str = "default",
        device="gpu",
        task_name="controlled-summarization",
        cache_dir=None,
        output_dir=None,
        temp_dir=None,
        tokenizer=None,
        checkpoint="google/flan-t5-base",
        model_max_length=1024,
        max_source_length=1024,
        max_target_length=500,
        os_name=None,
    ):
        super().__init__(
            task_name=task_name,
            model_name=model_name,
            device=device,
            cache_dir=cache_dir,
            output_dir=output_dir,
            temp_dir=temp_dir,
        )

        # NOTE(review): `self.data_utils` and `self.config` are not set in this
        # class; presumably the base Pipeline provides them (data_utils as a
        # factory/class that is replaced here by its instance) -- confirm.
        self.data_utils = self.data_utils(
            tokenizer=tokenizer,
            model_class=self.config["model"],
            checkpoint=checkpoint,
            model_max_length=model_max_length,
            max_source_length=max_source_length,
            max_target_length=max_target_length,
        )
        self.tokenizer = self.data_utils.tokenizer
        # Fall back to the running interpreter's platform name when not overridden.
        self.os_name = os_name if os_name is not None else os.name

    def predict(
        self,
        input: str,
        type: str = "pdf",
        output_dir=None,
        temp_dir=None,
        num_beams=1,
        num_return_sequences=1,
        save_results=True,
        length=100,
        keywords: Optional[List[str]] = None,
    ):
        """
        Summarize a string, a text file or a PDF file.

        Args:
            input (`str` or `os.PathLike`):
                Can be either:

                - A string, the text to be summarized.
                - A path to a *.txt* file to be summarized.
                - A path to a *.pdf* file to be summarized, a raw scientific document
                  without processing. The pipeline will automatically extract the
                  body text from the pdf.

                NOTE(review): earlier documentation also listed `List[str]`, but the
                string path wraps `input` into a one-element list, so pass a single
                string.
            type (`str`, default to `pdf`):
                The type of input, can be either:

                - `str` or `string`.
                - `text` or `txt` for a .txt file.
                - `pdf` for a pdf file. This is the default value.
            output_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which the predicted results files should be
                stored. If not provided, it will use the `output_dir` set for the
                pipeline.
            temp_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory which holds temporary files such as `.tei.xml`.
                If not provided, it will use the `temp_dir` set for the pipeline.
            num_beams (`int`, *optional*):
                Number of beams for beam search. 1 means no beam search.
                `num_beams` should be divisible by `num_return_sequences` for group
                beam search.
            num_return_sequences (`int`, *optional*):
                The number of independently computed returned sequences for each
                element in the batch.
            save_results (`bool`, default to `True`):
                Whether to save the results in a *.json* file.
                **Note**: This is invalid when `type` is set to `str` or `string`.
            length (`int`, default to `100`):
                The expected number of words in the summary. The value should be in
                [50, 100, 150, 200, 250] to ensure the controllability.
            keywords (`List[str]`, default to None):
                The keywords you want to appear in the summary.

        Returns:
            `Dict`: { "summary": [summary1, summary2, ...], "raw_text": raw_text }

        Raises:
            ValueError: If `type` is not one of the supported input types.

        Examples:

            >>> from SciAssist import Summarization
            >>> summarizer = Summarization()
            >>> res = summarizer.predict('Bert_paper.pdf', type="pdf", length=50, keywords=["Cloze task"])
            >>> res["summary"]
            ['This paper proposes a bidirectional pre-training method for language representations. The method is inspired by the Cloze task. The method is evaluated on a large suite of sentence-level and token-level tasks.']
        """
        if output_dir is None:
            output_dir = self.output_dir
        if temp_dir is None:
            temp_dir = self.temp_dir

        is_string_input = type in ("str", "string")
        if is_string_input:
            results = self._summarize_for_string(
                example=input,
                num_beams=num_beams,
                num_return_sequences=num_return_sequences,
                length=length,
                keywords=keywords,
            )
        elif type in ("txt", "text"):
            results = self._summarize_for_text(
                filename=input,
                num_beams=num_beams,
                num_return_sequences=num_return_sequences,
                length=length,
                keywords=keywords,
            )
        elif type == "pdf":
            results = self._summarize_for_pdf(
                filename=input,
                output_dir=output_dir,
                temp_dir=temp_dir,
                num_beams=num_beams,
                num_return_sequences=num_return_sequences,
                length=length,
                keywords=keywords,
            )
        else:
            # Previously an unknown type fell through and failed later with an
            # UnboundLocalError on `results`; fail early with a clear message.
            raise ValueError(
                f"Unsupported input type {type!r}; expected one of "
                "'str', 'string', 'txt', 'text' or 'pdf'."
            )

        # Save predicted results as a JSON file named after the input file.
        if save_results and not is_string_input:
            os.makedirs(output_dir, exist_ok=True)
            # splitext strips the real extension whatever its length (the former
            # `[:-4]` slice assumed exactly three characters after the dot).
            stem = os.path.splitext(os.path.basename(input))[0]
            with open(os.path.join(output_dir, f"{stem}_summ.json"), "w") as output:
                output.write(json.dumps(results) + "\n")

        return results

    def _to_device(self, batch):
        """
        Move the model inputs of a batch to the pipeline's device.

        Args:
            batch: A mapping providing `input_ids` and `attention_mask` entries
                that support `.to(device)`.

        Returns:
            `Dict`: A new dict holding only `input_ids` and `attention_mask`,
            both moved to `self.device`.

        Raises:
            ValueError: If `self.model_name` is not one of the model names
                handled here.
        """
        if self.model_name in ["default", "bart-cnn-on-mup", "flan-t5", "t5"]:
            return {
                "input_ids": batch["input_ids"].to(self.device),
                "attention_mask": batch["attention_mask"].to(self.device),
            }
        # Previously this fell through and returned None, which only surfaced
        # later as an opaque "NoneType is not subscriptable" error.
        raise ValueError(
            f"Unsupported model name {self.model_name!r} for summarization."
        )

    def _summarize(
        self,
        examples: List[str],
        num_beams=1,
        num_return_sequences=1,
        length=100,
        keywords=None,
    ) -> List[str]:
        """
        Summarize each text in the list.

        Args:
            examples (`List[str]`):
                A list of texts to be summarized.
            num_beams (`int`):
                Number of beams for beam search. 1 means no beam search.
            num_return_sequences (`int`):
                The number of independently computed returned sequences for each
                element in the batch.
            length (`int`):
                The expected summary length, attached to every example.
            keywords (`List[str]`, *optional*):
                The keywords attached to every example.

        Returns:
            `List[str]`: The decoded summaries, collected batch by batch.
        """
        # Prepare the dataset: every example carries the same length/keywords.
        dict_data = {
            "text": examples,
            "length": [length] * len(examples),
            "keywords": [keywords] * len(examples),
        }
        dataset = Dataset.from_dict(dict_data)

        # Tokenize, get input_ids and attention_masks.
        dataloader = self.data_utils.get_dataloader(dataset)

        results = []
        for batch in dataloader:
            batch = self._to_device(batch)
            # Get token ids of summary. Arguments are passed positionally, as the
            # signature of the project model's `generate` is not visible here.
            pred = self.model.generate(
                batch["input_ids"],
                batch["attention_mask"],
                num_beams,
                num_return_sequences,
            )
            # Convert token ids to text.
            decoded_preds = self.tokenizer.batch_decode(pred, skip_special_tokens=True)
            results.extend(decoded_preds)

        return results

    def _summarize_for_string(
        self,
        example: str,
        num_beams=1,
        num_return_sequences=1,
        length=100,
        keywords=None,
    ) -> Dict:
        """
        Summarize a text in string format.

        Args:
            example (`str`):
                The string to summarize.
            num_beams (`int`):
                Number of beams for beam search. 1 means no beam search.
            num_return_sequences (`int`):
                The number of independently computed returned sequences for each
                element in the batch.
            length (`int`):
                The expected number of words in the summary.
            keywords (`List[str]`, *optional*):
                The keywords that should appear in the summary.

        Returns:
            `Dict`: `{"summary": [...], "raw_text": example}`, i.e. the predicted
            summaries and the source text.
        """
        res = self._summarize(
            [example], num_beams, num_return_sequences, length=length, keywords=keywords
        )
        return {"summary": res, "raw_text": example}

    def _summarize_for_text(
        self,
        filename: str,
        num_beams: int = 1,
        num_return_sequences: int = 1,
        length=100,
        keywords=None,
    ) -> Dict:
        """
        Summarize a document from a text file.

        Args:
            filename (`str`):
                The path to the input text file.
            num_beams (`int`):
                Number of beams for beam search. 1 means no beam search.
            num_return_sequences (`int`):
                The number of independently computed returned sequences for each
                element in the batch.
            length (`int`):
                The expected number of words in the summary.
            keywords (`List[str]`, *optional*):
                The keywords that should appear in the summary.

        Returns:
            `Dict`: `{"summary": [...], "raw_text": text}`, i.e. the predicted
            summaries and the source text.
        """
        with open(filename, "r") as f:
            lines = f.readlines()
        # The whole file is summarized as one document: lines (which keep their
        # trailing newline) are joined with a single space.
        examples = [" ".join(lines)]

        res = self._summarize(
            examples, num_beams, num_return_sequences, length=length, keywords=keywords
        )
        return {"summary": res, "raw_text": examples[0]}

    def _summarize_for_pdf(
        self,
        filename: str,
        temp_dir: Optional[str] = BASE_TEMP_DIR,
        output_dir: Optional[str] = BASE_OUTPUT_DIR,
        num_beams: int = 1,
        num_return_sequences=1,
        length=100,
        keywords=None,
    ) -> Dict:
        """
        Summarize a document from a PDF file.

        Args:
            filename (`str`):
                The path to the pdf file to summarize.
            temp_dir (`Optional[str]`):
                The directory to save intermediate files, default to `BASE_TEMP_DIR`.
            output_dir (`Optional[str]`):
                The directory to save the text file, default to `BASE_OUTPUT_DIR`.
            num_beams (`int`):
                Number of beams for beam search. 1 means no beam search.
            num_return_sequences (`int`):
                The number of independently computed returned sequences for each
                element in the batch.
            length (`int`):
                The expected number of words in the summary.
            keywords (`List[str]`, *optional*):
                The keywords that should appear in the summary.

        Returns:
            `Dict`: Predicted summarization and source text.

        Raises:
            ValueError: If `self.os_name` is neither `posix` nor `nt`.
        """
        if self.os_name == "posix":
            # Convert PDF to JSON with doc2json; the intermediate JSON is written
            # to the temp directory as well.
            json_file = process_pdf_file(
                input_file=filename, temp_dir=temp_dir, output_dir=temp_dir
            )
            # Extract bodytext from pdf and save them in TEXT format.
            text_file = get_bodytext(json_file=json_file, output_dir=output_dir)
        elif self.os_name == "nt":
            text_file = windows_get_bodytext(path=filename, output_dir=output_dir)
        else:
            # Previously any other platform name left `text_file` unbound and
            # failed with an UnboundLocalError below.
            raise ValueError(
                f"Unsupported os_name {self.os_name!r}; expected 'posix' or 'nt'."
            )

        # Do summarization
        return self._summarize_for_text(
            text_file,
            num_beams=num_beams,
            num_return_sequences=num_return_sequences,
            length=length,
            keywords=keywords,
        )

    def evaluate(self):
        """Run the project's testing pipeline and return its result."""
        return test()