# main developer: Yixi Ding <dingyixi@hotmail.com>
import os
from typing import List, Tuple, Optional, Dict
from datasets import Dataset
from SciAssist import BASE_TEMP_DIR, BASE_OUTPUT_DIR
from SciAssist.pipelines.pipeline import Pipeline
from SciAssist.utils.pdf2text import process_pdf_file, get_bodytext
class SingleSummarization(Pipeline):
    """
    The pipeline for single document summarization.

    Args:
        model_name (`str`, *optional*):
            A string, the *model name* of a pretrained model provided for this task.
        device (`str`, *optional*):
            A string, `cpu` or `gpu`.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model should be
            cached if the standard cache should not be used.
        output_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which the predicted results files should be stored.
        temp_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory which holds temporary files such as `.tei.xml`.
        tokenizer (PreTrainedTokenizer, *optional*):
            A specific tokenizer.
        checkpoint (`str` or `os.PathLike`, *optional*):
            A checkpoint for the tokenizer. You can also specify the `checkpoint` while
            using the default tokenizer.
            Can be either:

            - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
              Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
              user or organization name, like `facebook/bart-large-cnn`.
            - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
              using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
            - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
              single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
              applicable to all derived classes)
        model_max_length (`int`, *optional*): The max sequence length the model accepts.
        max_source_length (`int`, *optional*): The max length of the input text.
        max_target_length (`int`, *optional*): The max length of the generated summary.
    """

    def __init__(
            self, model_name: str = "default", device="gpu",
            cache_dir=None,
            output_dir=None,
            temp_dir=None,
            tokenizer=None,
            checkpoint="facebook/bart-large-cnn",
            model_max_length=1024,
            max_source_length=1024,
            max_target_length=128
    ):
        super().__init__(task_name="single-doc-summarization", model_name=model_name, device=device,
                         cache_dir=cache_dir, output_dir=output_dir, temp_dir=temp_dir)

        # `self.data_utils` and `self.config` are expected to be provided by
        # `Pipeline.__init__` (presumably a data-utils class and the task
        # configuration -- confirm against the base class). The attribute is
        # rebound here from that callable to the instance it produces.
        self.data_utils = self.data_utils(
            tokenizer=tokenizer,
            model_class=self.config["model"],
            checkpoint=checkpoint,
            model_max_length=model_max_length,
            max_source_length=max_source_length,
            max_target_length=max_target_length
        )
        self.tokenizer = self.data_utils.tokenizer

    def predict(
            self, input: str, type: str = "pdf",
            output_dir=None,
            temp_dir=None,
            num_beams=1,
            num_return_sequences=1,
            save_results=True,
    ):
        """
        Summarize a single document given as a string, a text file or a PDF file.

        Args:
            input (`str` or `os.PathLike`):
                Can be either:

                - A string, the text to be summarized.
                - A path to a *.txt* file to be summarized.
                - A path to a *.pdf* file to be summarized, a raw scientific document without processing.
                  The pipeline will automatically extract the body text from the pdf.
            type (`str`, default to `pdf`):
                The type of input, can be either:

                - `str` or `string`.
                - `text` or `txt` for a .txt file.
                - `pdf` for a pdf file. This is the default value.
            output_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which the predicted results files should be stored.
                If not provided, it will use the `output_dir` set for the pipeline.
            temp_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory which holds temporary files such as `.tei.xml`.
                If not provided, it will use the `temp_dir` set for the pipeline.
            num_beams (`int`, *optional*):
                Number of beams for beam search. 1 means no beam search.
                `num_beams` should be divisible by `num_return_sequences` for group beam search.
            num_return_sequences(`int`):
                The number of independently computed returned sequences for each element in the batch.
            save_results (`bool`, default to `True`):
                Whether to save the generated summaries in a *.txt* file named
                `<input file name without extension>_summ.txt` inside `output_dir`.
                **Note**: This is invalid when `type` is set to `str` or `string`.

        Returns:
            `Dict`: { "summary": [summary1, summary2, ...], "raw_text": raw_text }

        Raises:
            ValueError: If `type` is not one of `str`, `string`, `txt`, `text` or `pdf`.

        Examples:

            >>> from SciAssist import SingleSummarization
            >>> pipeline = SingleSummarization()
            >>> res = pipeline.predict('N18-3011.pdf', type="pdf", num_beams=4, num_return_sequences=2)
            >>> res["summary"]
            ['The paper proposes a method for extracting structured information from scientific documents into the literature graph. The paper describes the attributes associated with nodes and edges of different types in the graph, and describes how to extract the entities mentioned in paper text. The method is evaluated on three tasks: sequence labeling, entity linking and relation extraction. ',
            'The paper proposes a method for extracting structured information from scientific documents into the literature graph. The paper describes the attributes associated with nodes and edges of different types in the graph, and describes how to extract the entities mentioned in paper text. The method is evaluated on three tasks: sequence labeling, entity linking and relation extraction. ']
        """
        # Fall back to the directories configured on the pipeline.
        if output_dir is None:
            output_dir = self.output_dir
        if temp_dir is None:
            temp_dir = self.temp_dir

        if type in ("str", "string"):
            results = self._summarize_for_string(example=input, num_beams=num_beams,
                                                 num_return_sequences=num_return_sequences)
        elif type in ("txt", "text"):
            results = self._summarize_for_text(filename=input, num_beams=num_beams,
                                               num_return_sequences=num_return_sequences)
        elif type == "pdf":
            results = self._summarize_for_pdf(filename=input, output_dir=output_dir, temp_dir=temp_dir,
                                              num_beams=num_beams, num_return_sequences=num_return_sequences)
        else:
            # Without this branch `results` would be unbound below and the
            # caller would only see an UnboundLocalError.
            raise ValueError(
                f"Unsupported input type {type!r}; expected one of 'str', 'string', 'txt', 'text' or 'pdf'."
            )

        # Save predicted results as a text file (file inputs only).
        if save_results and type not in ("str", "string"):
            os.makedirs(output_dir, exist_ok=True)
            # Strip the real extension rather than a fixed four characters, so
            # names such as `paper.text` or extension-less names are handled.
            stem = os.path.splitext(os.path.basename(input))[0]
            with open(os.path.join(output_dir, f"{stem}_summ.txt"), "w") as output:
                for res in results["summary"]:
                    output.write(res + "\n")

        return results

    def _to_device(self, batch):
        """
        Move the model inputs of a batch to the pipeline's device.

        Args:
            batch: A mapping holding at least `input_ids` and `attention_mask`,
                each exposing a `.to(device)` method.

        Returns:
            `Dict`: A new dict containing only `input_ids` and `attention_mask`,
            both moved to `self.device`.

        Raises:
            ValueError: If `self.model_name` is not a model this pipeline knows
                how to prepare inputs for.
        """
        if self.model_name in ["default", "bart-cnn-on-mup"]:
            return {
                "input_ids": batch["input_ids"].to(self.device),
                "attention_mask": batch["attention_mask"].to(self.device),
            }
        # Previously this fell through and returned None, which surfaced later
        # as an opaque "'NoneType' object is not subscriptable" in `_summarize`.
        raise ValueError(
            f"Unsupported model name {self.model_name!r} for single document summarization."
        )

    def _summarize(
            self,
            examples: List[str],
            num_beams=1,
            num_return_sequences=1
    ) -> List[str]:
        """
        Summarize each text in the list.

        Args:
            examples(`List[str]`): A list of texts to be summarized
            num_beams(`int`): Number of beams for beam search. 1 means no beam search.
            num_return_sequences(`int`): The number of independently computed returned sequences for each element in the batch.

        Returns:
            `List[str]`: The decoded summaries, collected in batch order.
        """
        # Prepare the dataset
        dataset = Dataset.from_dict({"text": examples})
        # Tokenize for Bart, get input_ids and attention_masks
        dataloader = self.data_utils.get_dataloader(dataset)

        results = []
        for batch in dataloader:
            batch = self._to_device(batch)
            # Get token ids of summary.
            # NOTE(review): arguments are passed positionally, so this relies on
            # the wrapped model's `generate` accepting
            # (input_ids, attention_mask, num_beams, num_return_sequences)
            # in that order -- confirm against the model class.
            pred = self.model.generate(batch["input_ids"], batch["attention_mask"], num_beams, num_return_sequences)
            # Convert token ids to text
            decoded_preds = self.tokenizer.batch_decode(pred, skip_special_tokens=True)
            results.extend(decoded_preds)

        return results

    def _summarize_for_string(
            self,
            example: str,
            num_beams=1,
            num_return_sequences=1
    ) -> Dict:
        """
        Summarize a text in string format.

        Args:
            example (`str`): The string to summarize.
            num_beams (`int`): Number of beams for beam search. 1 means no beam search.
            num_return_sequences(`int`): The number of independently computed returned sequences for each element in the batch.

        Returns:
            `Dict`: { "summary": list of predicted summaries, "raw_text": the source text }
        """
        res = self._summarize([example], num_beams, num_return_sequences)
        return {"summary": res, "raw_text": example}

    def _summarize_for_text(
            self,
            filename: str,
            num_beams: int = 1,
            num_return_sequences: int = 1
    ) -> Dict:
        """
        Summarize a document from a text file.

        Args:
            filename (`str`): The path to the input text file.
            num_beams (`int`): Number of beams for beam search. 1 means no beam search.
            num_return_sequences(`int`): The number of independently computed returned sequences for each element in the batch.

        Returns:
            `Dict`: { "summary": list of predicted summaries, "raw_text": the source text }
        """
        with open(filename, "r") as f:
            lines = f.readlines()
        # The whole file is treated as one document: its lines (which keep
        # their trailing newlines) are joined with a single space.
        examples = [" ".join(lines)]
        res = self._summarize(examples, num_beams, num_return_sequences)
        return {"summary": res, "raw_text": examples[0]}

    def _summarize_for_pdf(
            self,
            filename: str,
            temp_dir: Optional[str] = BASE_TEMP_DIR,
            output_dir: Optional[str] = BASE_OUTPUT_DIR,
            num_beams: int = 1,
            num_return_sequences=1
    ) -> Dict:
        """
        Summarize a document from a PDF file.

        Args:
            filename (`str`): The path to the pdf file to summarize.
            temp_dir (`Optional[str]`): The directory to save intermediate files, default to `BASE_TEMP_DIR`.
            output_dir (`Optional[str]`): The directory to save the extracted text file, default to `BASE_OUTPUT_DIR`.
            num_beams (`int`): Number of beams for beam search. 1 means no beam search.
            num_return_sequences(`int`): The number of independently computed returned sequences for each element in the batch.

        Returns:
            `Dict`: { "summary": list of predicted summaries, "raw_text": the source text }
        """
        # Convert PDF to JSON with doc2json. The JSON is an intermediate
        # artifact, so it is written to `temp_dir` as well.
        json_file = process_pdf_file(input_file=filename, temp_dir=temp_dir, output_dir=temp_dir)
        # Extract bodytext from pdf and save them in TEXT format.
        text_file = get_bodytext(json_file=json_file, output_dir=output_dir)
        # Do summarization
        return self._summarize_for_text(text_file, num_beams=num_beams, num_return_sequences=num_return_sequences)