# main developer: Yixi Ding <dingyixi@hotmail.com>
import json
import math
import os
from typing import List, Tuple, Optional, Dict
from datasets import Dataset
from SciAssist import BASE_TEMP_DIR, BASE_OUTPUT_DIR
from SciAssist.pipelines.pipeline import Pipeline
from SciAssist.pipelines.testing_pipeline import test
from SciAssist.utils.pdf2text import process_pdf_file, get_bodytext
from SciAssist.utils.windows_pdf2text import windows_get_bodytext
class Summarization(Pipeline):
    """
    The pipeline for single document summarization.

    Args:
        model_name (`str`, *optional*):
            A string, the *model name* of a pretrained model provided for this task.
        device (`str`, *optional*):
            A string, `cpu` or `gpu`.
        task_name (`str`, *optional*):
            The task name forwarded to the base pipeline.
            Defaults to `controlled-summarization`.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model should be
            cached if the standard cache should not be used.
        output_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which the predicted results files should be stored.
        temp_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory which holds temporary files such as `.tei.xml`.
        tokenizer (PreTrainedTokenizer, *optional*):
            A specific tokenizer.
        checkpoint (`str` or `os.PathLike`, *optional*):
            A checkpoint for the tokenizer. You can also specify the `checkpoint` while
            using the default tokenizer.
            Can be either:

            - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
              Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
              user or organization name, like `facebook/bart-large-cnn`.
            - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
              using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
            - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
              single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
              applicable to all derived classes)
        model_max_length (`int`, *optional*): The max sequence length the model accepts.
        max_source_length (`int`, *optional*): The max length of the input text.
        max_target_length (`int`, *optional*): The max length of the generated summary.
        os_name (`str`, *optional*):
            Selects the PDF text extractor: `posix` or `nt`.
            If not provided, `os.name` is used.
    """

    def __init__(
        self, model_name: str = "default", device="gpu",
        task_name="controlled-summarization",
        cache_dir=None,
        output_dir=None,
        temp_dir=None,
        tokenizer=None,
        checkpoint="google/flan-t5-base",
        model_max_length=1024,
        max_source_length=1024,
        max_target_length=500,
        os_name=None,
    ):
        super().__init__(task_name=task_name, model_name=model_name, device=device,
                         cache_dir=cache_dir, output_dir=output_dir, temp_dir=temp_dir)

        # `self.data_utils` is first read as a callable and then rebound to
        # the object it returns, configured for this pipeline.
        self.data_utils = self.data_utils(
            tokenizer=tokenizer,
            model_class=self.config["model"],
            checkpoint=checkpoint,
            model_max_length=model_max_length,
            max_source_length=max_source_length,
            max_target_length=max_target_length
        )
        self.tokenizer = self.data_utils.tokenizer
        # Fall back to the running interpreter's OS name when none is given.
        self.os_name = os_name if os_name is not None else os.name

    def predict(
        self, input: str, type: str = "pdf",
        output_dir=None,
        temp_dir=None,
        num_beams=1,
        num_return_sequences=1,
        save_results=True,
        length=100,
        keywords: Optional[List[str]] = None
    ):
        """
        Summarize a string, a text file or a PDF file.

        Args:
            input (`str` or `os.PathLike`):
                Can be either:

                - A string, the text to be summarized.
                - A path to a *.txt* file to be summarized.
                - A path to a *.pdf* file to be summarized, a raw scientific document without processing.
                  The pipeline will automatically extract the body text from the pdf.
            type (`str`, default to `pdf`):
                The type of input, can be either:

                - `str` or `string`.
                - `text` or `txt` for a .txt file.
                - `pdf` for a pdf file. This is the default value.
            output_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which the predicted results files should be stored.
                If not provided, it will use the `output_dir` set for the pipeline.
            temp_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory which holds temporary files such as `.tei.xml`.
                If not provided, it will use the `temp_dir` set for the pipeline.
            num_beams (`int`, *optional*):
                Number of beams for beam search. 1 means no beam search.
                `num_beams` should be divisible by `num_return_sequences` for group beam search.
            num_return_sequences(`int`, *optional*):
                The number of independently computed returned sequences for each element in the batch.
            save_results (`bool`, default to `True`):
                Whether to save the results in a *.json* file.
                **Note**: This is invalid when `type` is set to `str` or `string`.
            length(`int`, default to `100`):
                The expected number of words in the summary. The value should be in [50, 100, 150, 200, 250] to ensure the controllability.
            keywords(`List[str]`, default to None):
                The keywords you want to appear in the summary.

        Returns:
            `Dict`: { "summary": [summary1, summary2, ...], "raw_text": raw_text }

        Raises:
            ValueError: If `type` is not one of the supported input types.

        Examples:
            >>> from SciAssist import Summarization
            >>> summarizer = Summarization()
            >>> res = summarizer.predict('Bert_paper.pdf', type="pdf", length=50, keywords=["Cloze task"])
            >>> res["summary"]
            ['This paper proposes a bidirectional pre-training method for language representations. The method is inspired by the Cloze task. The method is evaluated on a large suite of sentence-level and token-level tasks.']
        """
        if output_dir is None:
            output_dir = self.output_dir
        if temp_dir is None:
            temp_dir = self.temp_dir

        if type in ("str", "string"):
            results = self._summarize_for_string(example=input, num_beams=num_beams,
                                                 num_return_sequences=num_return_sequences,
                                                 length=length, keywords=keywords)
        elif type in ("txt", "text"):
            results = self._summarize_for_text(filename=input, num_beams=num_beams,
                                               num_return_sequences=num_return_sequences,
                                               length=length, keywords=keywords)
        elif type == "pdf":
            results = self._summarize_for_pdf(filename=input, output_dir=output_dir, temp_dir=temp_dir,
                                              num_beams=num_beams, num_return_sequences=num_return_sequences,
                                              length=length, keywords=keywords)
        else:
            # Fail early with a clear message instead of hitting an unbound
            # `results` further down.
            raise ValueError(
                f"Unsupported input type {type!r}; expected one of "
                "'str', 'string', 'txt', 'text' or 'pdf'."
            )

        # Save predicted results as a JSON file named after the input file.
        if save_results and type not in ("str", "string"):
            os.makedirs(output_dir, exist_ok=True)
            # splitext drops the real extension whatever its length, rather
            # than assuming it is exactly three characters long.
            stem = os.path.splitext(os.path.basename(input))[0]
            with open(os.path.join(output_dir, f"{stem}_summ.json"), "w") as output:
                output.write(json.dumps(results) + "\n")

        return results

    def _to_device(self, batch):
        """
        Move the model inputs of a batch to the pipeline's device.

        Args:
            batch: A mapping providing `input_ids` and `attention_mask`, each
                exposing a `.to(device)` method.

        Returns:
            A dict holding `input_ids` and `attention_mask` moved to
            `self.device` when `self.model_name` is one of the recognised
            names, otherwise `None`.
        """
        if self.model_name in ["default", "bart-cnn-on-mup", "flan-t5", "t5"]:
            return {
                "input_ids": batch["input_ids"].to(self.device),
                "attention_mask": batch["attention_mask"].to(self.device),
            }
        return None

    def _summarize(
        self,
        examples: List[str],
        num_beams=1,
        num_return_sequences=1,
        length=100,
        keywords=None
    ) -> List[str]:
        """
        Summarize each text in the list.

        Args:
            examples(`List[str]`): A list of texts to be summarized
            num_beams(`int`): Number of beams for beam search. 1 means no beam search.
            num_return_sequences(`int`): The number of independently computed returned sequences for each element in the batch.
            length(`int`): The expected summary length, attached to every example.
            keywords(`List[str]`, *optional*): The keywords attached to every example.

        Returns:
            `List[str]`: The decoded summaries of all batches, in dataloader order.
        """
        # Prepare the dataset; the same length/keywords control applies to
        # every example.
        dict_data = {
            "text": examples,
            "length": [length] * len(examples),
            "keywords": [keywords] * len(examples),
        }
        dataset = Dataset.from_dict(dict_data)

        # Presumably tokenizes the dataset; each batch is expected to carry
        # `input_ids` and `attention_mask`, which are read below.
        dataloader = self.data_utils.get_dataloader(dataset)

        results = []
        for batch in dataloader:
            batch = self._to_device(batch)
            # Get token ids of summary
            pred = self.model.generate(batch["input_ids"], batch["attention_mask"], num_beams, num_return_sequences)
            # Convert token ids to text
            decoded_preds = self.tokenizer.batch_decode(pred, skip_special_tokens=True)
            results.extend(decoded_preds)
        return results

    def _summarize_for_string(
        self,
        example: str,
        num_beams=1,
        num_return_sequences=1,
        length=100,
        keywords=None
    ) -> Dict:
        """
        Summarize a text in string format.

        Args:
            example (`str`): The string to summarize.
            num_beams (`int`): Number of beams for beam search. 1 means no beam search.
            num_return_sequences(`int`): The number of independently computed returned sequences for each element in the batch.
            length(`int`): The expected number of words in the summary.
            keywords(`List[str]`, *optional*): The keywords you want to appear in the summary.

        Returns:
            `Dict`: `{"summary": [...], "raw_text": example}`, the predicted
            summaries and the source text.
        """
        res = self._summarize([example], num_beams, num_return_sequences, length=length, keywords=keywords)
        # NOTE: the source text is returned unfiltered; no keyword-based
        # sentence extraction is applied to `raw_text`.
        return {"summary": res, "raw_text": example}

    def _summarize_for_text(
        self,
        filename: str,
        num_beams: int = 1,
        num_return_sequences: int = 1,
        length=100,
        keywords=None
    ) -> Dict:
        """
        Summarize a document from a text file.

        Args:
            filename (`str`): The path to the input text file.
            num_beams (`int`): Number of beams for beam search. 1 means no beam search.
            num_return_sequences(`int`): The number of independently computed returned sequences for each element in the batch.
            length(`int`): The expected number of words in the summary.
            keywords(`List[str]`, *optional*): The keywords you want to appear in the summary.

        Returns:
            `Dict`: `{"summary": [...], "raw_text": text}`, the predicted
            summaries and the source text read from the file.
        """
        with open(filename, "r") as f:
            lines = f.readlines()
        # The whole file is treated as a single document: its lines are
        # joined with spaces into one example.
        examples = [" ".join(lines)]
        res = self._summarize(examples, num_beams, num_return_sequences, length=length, keywords=keywords)
        # NOTE: the source text is returned unfiltered; no keyword-based
        # sentence extraction is applied to `raw_text`.
        return {"summary": res, "raw_text": examples[0]}

    def _summarize_for_pdf(
        self,
        filename: str,
        temp_dir: Optional[str] = BASE_TEMP_DIR,
        output_dir: Optional[str] = BASE_OUTPUT_DIR,
        num_beams: int = 1,
        num_return_sequences=1,
        length=100,
        keywords=None
    ) -> Dict:
        """
        Summarize a document from a PDF file.

        Args:
            filename (`str`): The path to the pdf file to summarize.
            temp_dir (`Optional[str]`): The directory to save intermediate files, default to `BASE_TEMP_DIR`.
            output_dir (`Optional[str]`): The directory to save the extracted text file, default to `BASE_OUTPUT_DIR`.
            num_beams (`int`): Number of beams for beam search. 1 means no beam search.
            num_return_sequences(`int`): The number of independently computed returned sequences for each element in the batch.
            length(`int`): The expected number of words in the summary.
            keywords(`List[str]`, *optional*): The keywords you want to appear in the summary.

        Returns:
            `Dict`:
                Predicted summarization and source text.

        Raises:
            ValueError: If `self.os_name` is neither `posix` nor `nt`.
        """
        if self.os_name == "posix":
            # Convert PDF to JSON with doc2json.
            json_file = process_pdf_file(input_file=filename, temp_dir=temp_dir, output_dir=temp_dir)
            # Extract bodytext from pdf and save them in TEXT format.
            text_file = get_bodytext(json_file=json_file, output_dir=output_dir)
        elif self.os_name == "nt":
            text_file = windows_get_bodytext(path=filename, output_dir=output_dir)
        else:
            # Without this branch `text_file` would be unbound below.
            raise ValueError(
                f"Unsupported os_name {self.os_name!r}; expected 'posix' or 'nt'."
            )

        # Do summarization
        return self._summarize_for_text(text_file, num_beams=num_beams,
                                        num_return_sequences=num_return_sequences,
                                        length=length, keywords=keywords)

    def evaluate(self):
        """Call the testing pipeline's `test` function and return its result."""
        return test()