Source code for SciAssist.pipelines.reference_string_parsing

# main developer: Yixi Ding <dingyixi@hotmail.com>

import json
import os
from typing import List, Tuple, Optional, Union, Dict

from datasets import Dataset
from transformers import PreTrainedTokenizer

from SciAssist import BASE_OUTPUT_DIR, BASE_TEMP_DIR
from SciAssist.datamodules.components.cora_label import LABEL_NAMES
from SciAssist.datamodules.components.cora_label import label2id
from SciAssist.pipelines.pipeline import Pipeline
from SciAssist.utils.pdf2text import process_pdf_file, get_reference
from SciAssist.utils.windows_pdf2text import windows_get_reference


[docs]class ReferenceStringParsing(Pipeline):
    """
    The pipeline for reference string parsing.

    Args:
        model_name (`str`, *optional*):
            A string, the *model name* of a pretrained model provided for this task.
        device (`str`, *optional*):
            A string, `cpu` or `gpu`.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model should be
            cached if the standard cache should not be used.
        output_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which the predicted results files should be stored.
        temp_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory which holds temporary files such as `.tei.xml`.
        tokenizer (PreTrainedTokenizer, *optional*):
            A specific tokenizer.
        checkpoint (`str` or `os.PathLike`, *optional*):
            A checkpoint for the tokenizer. You can also specify the `checkpoint` while
            using the default tokenizer.
            Can be either:

                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                  user or organization name, like `allenai/scibert_scivocab_uncased`.
                - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                  using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                  single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                  applicable to all derived classes)

        model_max_length (`int`, *optional*): The max sequence length the model accepts.
    """

    def __init__(
            self, model_name: Optional[str] = "default", device: Optional[str] = "gpu",
            cache_dir = None,
            output_dir = None,
            temp_dir = None,
            tokenizer: PreTrainedTokenizer = None,
            checkpoint="allenai/scibert_scivocab_uncased",
            model_max_length=512,
            os_name=None,
    ):

        super().__init__(task_name="reference-string-parsing", model_name=model_name, device=device,
                         cache_dir=cache_dir, output_dir=output_dir, temp_dir=temp_dir)

        self.data_utils = self.data_utils(
            tokenizer=tokenizer,
            checkpoint=checkpoint,
            model_max_length=model_max_length
        )
        self.os_name = os_name if os_name != None else os.name

[docs]    def predict(
            self, input, type: str = "pdf", dehyphen=False,
            output_dir=None,
            temp_dir=None,
            save_results=True,
    ):

        """

        Args:
            input (`str` or `List[str]` or `os.PathLike`):
                Can be either:

                    - A string, the reference string to be parsed.
                    - A list of strings to be parsed.
                    - A path to a *.txt* file to be parsed. Each line of the source file contains a
                      reference string.
                    - A path to a *.pdf* file to be parsed, a raw scientific document without processing.
                      The pipeline will automatically extract the reference strings from the pdf.

            type (`str`, default to `pdf`):
                The type of input, can be either:

                    - `str` or `string`.
                    - `text`or `txt` for a .txt file.
                    - `pdf` for a pdf file. This is the default value.

            dehyphen (`bool`, default to `False`):
                Whether to remove hyphens in raw text.
            output_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which the predicted results files should be stored.
                If not provided, it will use the `output_dir` set for the pipeline.
            temp_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory which holds temporary files such as `.tei.xml`.
                If not provided, it will use the `temp_dir` set for the pipeline.
            save_results (`bool`, default to `True`):
                Whether to save the results in a *.json* file.
                **Note**: This is invalid when `type` is set to `str` or `string`.

        Returns:
            `List[Dict]`: [{"tagged_text": tagged_text, "tokens": tokens_list ,"tags": tags_list } , ... ]


        Examples:

            >>> from SciAssist import ReferenceStringParsing
            >>> pipeline = ReferenceStringParsing()
            >>> pipeline.predict(
            ...     "Waleed Ammar, Matthew E. Peters, Chandra Bhagavat- ula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).",
            ...     type="str"
            ... )
            [{'tagged_text': '<author>Waleed</author> <author>Ammar,</author> <author>Matthew</author> <author>E.</author> <author>Peters,</author> <author>Chandra</author> <author>Bhagavat-</author> <author>ula,</author> <author>and</author> <author>Russell</author> <author>Power.</author> <date>2017.</date> <title>The</title> <title>ai2</title> <title>system</title> <title>at</title> <title>semeval-2017</title> <title>task</title> <title>10</title> <title>(scienceie):</title> <title>semi-supervised</title> <title>end-to-end</title> <title>entity</title> <title>and</title> <title>relation</title> <title>extraction.</title> <booktitle>In</booktitle> <booktitle>ACL</booktitle> <booktitle>workshop</booktitle> <booktitle>(SemEval).</booktitle>',
            'tokens': ['Waleed', 'Ammar,', 'Matthew', 'E.', 'Peters,', 'Chandra', 'Bhagavat-', 'ula,', 'and', 'Russell', 'Power.', '2017.', 'The', 'ai2', 'system', 'at', 'semeval-2017', 'task', '10', '(scienceie):', 'semi-supervised', 'end-to-end', 'entity', 'and', 'relation', 'extraction.', 'In', 'ACL', 'workshop', '(SemEval).'],
            'tags': ['author', 'author', 'author', 'author', 'author', 'author', 'author', 'author', 'author', 'author', 'author', 'date', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'title', 'booktitle', 'booktitle', 'booktitle', 'booktitle']}]

        """
        if output_dir is None:
            output_dir = self.output_dir
        if temp_dir is None:
            temp_dir = self.temp_dir

        if type in ["str", "string"]:
            results = self._predict_for_string(example=input, dehyphen=dehyphen)
        elif type in ["txt", "text"]:
            results = self._predict_for_text(filename=input, dehyphen=dehyphen)
        elif type == "pdf":
            results = self._predict_for_pdf(filename=input, output_dir=output_dir, temp_dir=temp_dir, dehyphen=dehyphen)
        # Save predicted results as a text file
        if save_results and type not in ["str", "string"]:
            os.makedirs(output_dir, exist_ok=True)
            output_file = os.path.basename(input)
            with open(os.path.join(output_dir, f"{output_file[:-4]}_rsp.json"), "w") as output:
                for res in results:
                    output.write(json.dumps(res) + "\n")

        return results

    def _dehyphen_for_str(self, text: str):
        text = text.replace("- ", "")
        text = text.replace("-", " ")
        return text

    def _to_device(self, batch):
        if self.model_name in ["default", "scibert-on-cora"]:
            return {
                "input_ids": batch["input_ids"].to(self.device),
                "token_type_ids": batch["token_type_ids"].to(self.device),
                "attention_mask": batch["attention_mask"].to(self.device),
                "token_mapping": batch["token_mapping"].to(self.device)
            }

    def _predict(self, examples: List[List[str]]) -> List[Dict]:
        """
        Parse a list of tokens obtained from reference strings.

        Args:
            examples (`List[List[str]]`):
                The inputs for inference, where each item is a list of tokens.

        Returns:
            `List[Dict]`:
                Tagged strings, origin tokens and labels predicted by the model.

        """

        # Prepare the dataset
        dict_data = {"tokens": examples}
        dataset = Dataset.from_dict(dict_data)

        dataloader = self.data_utils.get_dataloader(dataset, label2id=label2id)

        results = []
        true_preds = []
        for batch in dataloader:
            # Predict the labels

            batch = self._to_device(batch)
            outputs = self.model(**batch)
            preds = outputs.logits.argmax(dim=-1)
            # Convert ids to labels and
            # merge the labels according to origin tokens.
            preds = [[LABEL_NAMES[i] for i in pred if i < len(LABEL_NAMES)] for pred in preds]
            true_preds.extend(preds)

        tokens = examples

        # Generate the tagged strings.
        for i in range(len(tokens)):
            tagged_words = []
            for token, label in zip(tokens[i], true_preds[i]):
                tagged_word = f"<{label}>{token}</{label}>"
                tagged_words.append(tagged_word)
            result = " ".join(tagged_words)
            results.append(
                {
                    "tagged_text": result,
                    "tokens": tokens[i],
                    "tags": true_preds[i],
                }
            )
        return results

    def _predict_for_string(self, example: Union[str, List[str]], dehyphen: Optional[bool] = False) -> List[Dict]:
        """
        Parse a reference string.

        Args:
            example (`Union[str, List[str]]`): The string to parse.
            dehyphen (`Optional[bool]`): Whether to remove '-', default to `False`.
        Returns:
           `List[Dict]`:
                Tagged string, origin tokens and labels predicted by the model.

        """

        if isinstance(example, list):
            examples = example
        else:
            examples = [example]

        # remove '-' in text
        if dehyphen == True:
            examples = [self._dehyphen_for_str(example) for example in examples]

        splitted_examples = [example.split() for example in examples]
        results = self._predict(splitted_examples)

        return results

    def _predict_for_text(
            self,
            filename: str,
            dehyphen: Optional[bool] = False,
    ) -> List[Dict]:
        """

        Parse reference strings from a text and save the result as a text file.

        Args:
            filename (`str`): The path to the text file to predict.
            dehyphen (`Optional[bool]`): Whether to remove '-', default to `False`.

        Returns:
            `List[Dict]`:
                Tagged strings, origin tokens and labels predicted by the model.

        """

        with open(filename, "r") as f:
            examples = f.readlines()

        # remove '-' in text
        if dehyphen == True:
            examples = [self._dehyphen_for_str(example) for example in examples]

        splitted_examples = [example.split() for example in examples]
        results = self._predict(splitted_examples)

        return results

    def _predict_for_pdf(
            self,
            filename: str,
            output_dir: Optional[str] = BASE_OUTPUT_DIR,
            temp_dir: Optional[str] = BASE_TEMP_DIR,
            dehyphen: Optional[bool] = False,
    ) -> Tuple[List[str], List[List[str]], List[List[str]]]:
        """
        Parse reference strings from a PDF and save the result as a text file.

        Args:
            filename (`str`): The path to the pdf file to parse.
            output_dir (`Optional[str]`): The directory to save the result file, default to `result/`.
            temp_dir (`Optional[str]`): The diretorcy to save intermediate file, default to `temp/`.
            dehyphen (`Optional[bool]`): Whether to remove '-', default to `False`.

        Returns:
           `List[Dict]`:
                Tagged strings, origin tokens and labels predicted by the model.
        """
        if self.os_name == "posix":
            # Convert PDF to JSON with doc2json.
            json_file = process_pdf_file(input_file=filename, temp_dir=temp_dir, output_dir=temp_dir)
            # Extract reference strings from JSON and save them in TEXT format.
            text_file = get_reference(json_file=json_file, output_dir=output_dir)
        elif self.os_name == "nt":
            text_file = windows_get_reference(path=filename, output_dir=output_dir)


        return self._predict_for_text(text_file, dehyphen=dehyphen)