Source code for SciAssist.pipelines.reference_string_parsing

# main developer: Yixi Ding <dingyixi@hotmail.com>

import json
import os
from typing import List, Tuple, Optional, Union, Dict

from datasets import Dataset
from transformers import PreTrainedTokenizer

from SciAssist import BASE_OUTPUT_DIR, BASE_TEMP_DIR
from SciAssist.datamodules.components.cora_label import LABEL_NAMES
from SciAssist.datamodules.components.cora_label import label2id
from SciAssist.pipelines.pipeline import Pipeline
from SciAssist.utils.pdf2text import process_pdf_file, get_reference
from SciAssist.utils.windows_pdf2text import windows_get_reference


class ReferenceStringParsing(Pipeline):
    """
    The pipeline for reference string parsing.

    Args:
        model_name (`str`, *optional*):
            A string, the *model name* of a pretrained model provided for this task.
        device (`str`, *optional*):
            A string, `cpu` or `gpu`.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model should be
            cached if the standard cache should not be used.
        output_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which the predicted results files should be stored.
        temp_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory which holds temporary files such as `.tei.xml`.
        tokenizer (PreTrainedTokenizer, *optional*):
            A specific tokenizer.
        checkpoint (`str` or `os.PathLike`, *optional*):
            A checkpoint for the tokenizer.
            You can also specify the `checkpoint` while using the default tokenizer.
            Can be either:

            - A string, the *model id* of a predefined tokenizer hosted inside a model
              repo on huggingface.co. Valid model ids can be located at the root-level,
              like `bert-base-uncased`, or namespaced under a user or organization name,
              like `allenai/scibert_scivocab_uncased`.
            - A path to a *directory* containing vocabulary files required by the
              tokenizer, for instance saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
            - A path or url to a single saved vocabulary file if and only if the
              tokenizer only requires a single vocabulary file (like Bert or XLNet),
              e.g.: `./my_model_directory/vocab.txt`.
              (Not applicable to all derived classes)
        model_max_length (`int`, *optional*):
            The max sequence length the model accepts.
        os_name (`str`, *optional*):
            Operating-system identifier used to choose the PDF extraction backend.
            Defaults to `os.name`. Only `posix` and `nt` are handled when parsing PDFs.
    """

    def __init__(
        self,
        model_name: Optional[str] = "default",
        device: Optional[str] = "gpu",
        cache_dir=None,
        output_dir=None,
        temp_dir=None,
        tokenizer: PreTrainedTokenizer = None,
        checkpoint="allenai/scibert_scivocab_uncased",
        model_max_length=512,
        os_name=None,
    ):
        super().__init__(
            task_name="reference-string-parsing",
            model_name=model_name,
            device=device,
            cache_dir=cache_dir,
            output_dir=output_dir,
            temp_dir=temp_dir,
        )
        # `self.data_utils` is called here and then rebound to the returned object.
        # NOTE(review): presumably the parent class sets it to a data-utils class
        # for this task -- confirm against `Pipeline`.
        self.data_utils = self.data_utils(
            tokenizer=tokenizer,
            checkpoint=checkpoint,
            model_max_length=model_max_length,
        )
        self.os_name = os_name if os_name is not None else os.name

    def predict(
        self,
        input,
        type: str = "pdf",
        dehyphen=False,
        output_dir=None,
        temp_dir=None,
        save_results=True,
    ):
        """
        Parse reference strings given directly, in a text file, or in a PDF.

        Args:
            input (`str` or `List[str]` or `os.PathLike`):
                Can be either:

                - A string, the reference string to be parsed.
                - A list of strings to be parsed.
                - A path to a *.txt* file to be parsed. Each line of the source file
                  contains a reference string.
                - A path to a *.pdf* file to be parsed, a raw scientific document
                  without processing. The pipeline will automatically extract the
                  reference strings from the pdf.
            type (`str`, default to `pdf`):
                The type of input, can be either:

                - `str` or `string`.
                - `text` or `txt` for a .txt file.
                - `pdf` for a pdf file. This is the default value.
            dehyphen (`bool`, default to `False`):
                Whether to remove hyphens in raw text.
            output_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which the predicted results files should be
                stored. If not provided, it will use the `output_dir` set for the
                pipeline.
            temp_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory which holds temporary files such as `.tei.xml`.
                If not provided, it will use the `temp_dir` set for the pipeline.
            save_results (`bool`, default to `True`):
                Whether to save the results in a *.json* file (one JSON object per
                line, named `<input basename without extension>_rsp.json`).
                **Note**: This is invalid when `type` is set to `str` or `string`.

        Returns:
            `List[Dict]`:
                [{"tagged_text": tagged_text, "tokens": tokens_list ,"tags": tags_list } , ... ]

        Raises:
            ValueError: If `type` is not one of the supported input types.

        Examples:

            >>> from SciAssist import ReferenceStringParsing
            >>> pipeline = ReferenceStringParsing()
            >>> pipeline.predict(
            ...     "Waleed Ammar, Matthew E. Peters, Chandra Bhagavat- ula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).",
            ...     type="str"
            ... )
            [{'tagged_text': '<author>Waleed</author> <author>Ammar,</author> <author>Matthew</author> <author>E.</author> <author>Peters,</author> <author>Chandra</author> <author>Bhagavat-</author> <author>ula,</author> <author>and</author> <author>Russell</author> <author>Power.</author> <date>2017.</date> <title>The</title> <title>ai2</title> <title>system</title> <title>at</title> <title>semeval-2017</title> <title>task</title> <title>10</title> <title>(scienceie):</title> <title>semi-supervised</title> <title>end-to-end</title> <title>entity</title> <title>and</title> <title>relation</title> <title>extraction.</title> <booktitle>In</booktitle> <booktitle>ACL</booktitle> <booktitle>workshop</booktitle> <booktitle>(SemEval).</booktitle>', 'tokens': ['Waleed', 'Ammar,', 'Matthew', 'E.', 'Peters,', 'Chandra', 'Bhagavat-', 'ula,', 'and', 'Russell', 'Power.', '2017.', 'The', 'ai2', 'system', 'at', 'semeval-2017', 'task', '10', '(scienceie):', 'semi-supervised', 'end-to-end', 'entity', 'and', 'relation', 'extraction.', 'In', 'ACL', 'workshop', '(SemEval).'], 'tags': ['author', 'author', 'author', 'author', 'author', 'author', 'author', 'author', 'author', 'author', 'author', 'date', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'booktitle', 'booktitle', 'booktitle', 'booktitle']}]
        """
        if output_dir is None:
            output_dir = self.output_dir
        if temp_dir is None:
            temp_dir = self.temp_dir

        if type in ["str", "string"]:
            results = self._predict_for_string(example=input, dehyphen=dehyphen)
        elif type in ["txt", "text"]:
            results = self._predict_for_text(filename=input, dehyphen=dehyphen)
        elif type == "pdf":
            results = self._predict_for_pdf(
                filename=input, output_dir=output_dir, temp_dir=temp_dir, dehyphen=dehyphen
            )
        else:
            # Fail with a clear message instead of an UnboundLocalError on `results`.
            raise ValueError(
                f"Unsupported input type {type!r}; expected one of "
                "'str', 'string', 'txt', 'text' or 'pdf'."
            )

        # Save predicted results, one JSON object per line. String inputs have no
        # source file to derive an output name from, so they are never saved.
        if save_results and type not in ["str", "string"]:
            os.makedirs(output_dir, exist_ok=True)
            # Strip the real extension rather than assuming it is 3 characters long.
            stem = os.path.splitext(os.path.basename(input))[0]
            with open(os.path.join(output_dir, f"{stem}_rsp.json"), "w") as output:
                for res in results:
                    output.write(json.dumps(res) + "\n")

        return results

    def _dehyphen_for_str(self, text: str):
        """
        Remove hyphens from a string.

        A hyphen followed by a space is deleted together with the space (joining the
        two fragments); every remaining hyphen is replaced by a single space.

        Args:
            text (`str`): The text to process.

        Returns:
            `str`: The text without hyphens.
        """
        text = text.replace("- ", "")
        text = text.replace("-", " ")
        return text

    def _to_device(self, batch):
        """
        Move the model inputs of a batch to `self.device`.

        Args:
            batch: A mapping holding `input_ids`, `token_type_ids`, `attention_mask`
                and `token_mapping`, each exposing a `.to(device)` method.

        Returns:
            `Dict`: The four entries above, moved to `self.device`.

        Raises:
            ValueError: If `self.model_name` has no known batch layout.
        """
        if self.model_name in ["default", "scibert-on-cora"]:
            keys = ("input_ids", "token_type_ids", "attention_mask", "token_mapping")
            return {key: batch[key].to(self.device) for key in keys}
        # Previously this fell through and returned None, which only surfaced later
        # as an opaque TypeError at `self.model(**batch)`.
        raise ValueError(
            f"Unsupported model_name {self.model_name!r}; "
            "expected 'default' or 'scibert-on-cora'."
        )

    def _predict(self, examples: List[List[str]]) -> List[Dict]:
        """
        Parse a list of tokens obtained from reference strings.

        Args:
            examples (`List[List[str]]`):
                The inputs for inference, where each item is a list of tokens.

        Returns:
            `List[Dict]`:
                Tagged strings, origin tokens and labels predicted by the model.
        """
        # Prepare the dataset
        dataset = Dataset.from_dict({"tokens": examples})
        dataloader = self.data_utils.get_dataloader(dataset, label2id=label2id)

        num_labels = len(LABEL_NAMES)
        true_preds = []
        for batch in dataloader:
            # Predict the labels
            batch = self._to_device(batch)
            outputs = self.model(**batch)
            preds = outputs.logits.argmax(dim=-1)
            # Convert ids to labels; ids outside the label set are dropped.
            # NOTE(review): presumably those ids mark padded positions so that the
            # remaining labels line up with the origin tokens -- confirm.
            preds = [[LABEL_NAMES[i] for i in pred if i < num_labels] for pred in preds]
            true_preds.extend(preds)

        # Generate the tagged strings.
        results = []
        for idx, tokens in enumerate(examples):
            tags = true_preds[idx]
            tagged_text = " ".join(
                f"<{label}>{token}</{label}>" for token, label in zip(tokens, tags)
            )
            results.append(
                {
                    "tagged_text": tagged_text,
                    "tokens": tokens,
                    "tags": tags,
                }
            )
        return results

    def _predict_for_string(
        self,
        example: Union[str, List[str]],
        dehyphen: Optional[bool] = False,
    ) -> List[Dict]:
        """
        Parse a reference string (or a list of reference strings).

        Args:
            example (`Union[str, List[str]]`): The string(s) to parse.
            dehyphen (`Optional[bool]`): Whether to remove '-', default to `False`.

        Returns:
            `List[Dict]`:
                Tagged string, origin tokens and labels predicted by the model.
        """
        examples = example if isinstance(example, list) else [example]
        # remove '-' in text
        if dehyphen:
            examples = [self._dehyphen_for_str(text) for text in examples]
        # Tokens are whitespace-separated words.
        splitted_examples = [text.split() for text in examples]
        return self._predict(splitted_examples)

    def _predict_for_text(
        self,
        filename: str,
        dehyphen: Optional[bool] = False,
    ) -> List[Dict]:
        """
        Parse reference strings from a text file, one reference string per line.

        Args:
            filename (`str`): The path to the text file to predict.
            dehyphen (`Optional[bool]`): Whether to remove '-', default to `False`.

        Returns:
            `List[Dict]`:
                Tagged strings, origin tokens and labels predicted by the model.
        """
        with open(filename, "r") as f:
            examples = f.readlines()
        # remove '-' in text
        if dehyphen:
            examples = [self._dehyphen_for_str(line) for line in examples]
        # `split()` also discards the trailing newline of each line.
        splitted_examples = [line.split() for line in examples]
        return self._predict(splitted_examples)

    def _predict_for_pdf(
        self,
        filename: str,
        output_dir: Optional[str] = BASE_OUTPUT_DIR,
        temp_dir: Optional[str] = BASE_TEMP_DIR,
        dehyphen: Optional[bool] = False,
    ) -> List[Dict]:
        """
        Extract reference strings from a PDF into a text file and parse them.

        Args:
            filename (`str`): The path to the pdf file to parse.
            output_dir (`Optional[str]`):
                The directory to save the extracted reference text file,
                default to `BASE_OUTPUT_DIR`.
            temp_dir (`Optional[str]`):
                The directory to save intermediate files, default to `BASE_TEMP_DIR`.
            dehyphen (`Optional[bool]`): Whether to remove '-', default to `False`.

        Returns:
            `List[Dict]`:
                Tagged strings, origin tokens and labels predicted by the model.

        Raises:
            NotImplementedError: If `self.os_name` is neither `posix` nor `nt`.
        """
        if self.os_name == "posix":
            # Convert PDF to JSON with doc2json; the intermediate JSON is written
            # to `temp_dir`.
            json_file = process_pdf_file(input_file=filename, temp_dir=temp_dir, output_dir=temp_dir)
            # Extract reference strings from JSON and save them in TEXT format.
            text_file = get_reference(json_file=json_file, output_dir=output_dir)
        elif self.os_name == "nt":
            text_file = windows_get_reference(path=filename, output_dir=output_dir)
        else:
            # Fail explicitly instead of hitting an unbound `text_file` below.
            raise NotImplementedError(
                f"PDF parsing is not supported for os_name {self.os_name!r}; "
                "expected 'posix' or 'nt'."
            )

        return self._predict_for_text(text_file, dehyphen=dehyphen)