Skip to content

BART_LARGE_CNN

Download Flojoy Studio to try this app
Take an input dataframe with multiple rows and a single column, then produce a dataframe with a single "summary_text" column. The "summary_text" column contains a summary of the text in the corresponding row of the input dataframe. Params: `default` (DataFrame): the text to summarize. Returns: `out` (DataFrame): a dataframe containing the summary text in the "summary_text" column.
Python Code
from flojoy import flojoy, DataFrame


@flojoy(deps={"transformers": "4.30.2", "torch": "2.0.1", "torchvision": "0.15.2"})
def BART_LARGE_CNN(default: DataFrame) -> DataFrame:
    """Take an input dataframe with multiple rows and a single column, then produce a dataframe with a single "summary_text" column.

    The "summary_text" column contains a summary of the text in the corresponding row of the input dataframe.

    Each row is tokenized without truncation and its token ids are split into
    consecutive chunks of at most 1024 ids. Every chunk is summarized
    independently and the chunk summaries are joined with newlines.

    Parameters
    ----------
    default : DataFrame
        The text to summarize. Must contain exactly one column.

    Returns
    -------
    DataFrame
        dataframe containing the summary text in the "summary_text" column

    Raises
    ------
    AssertionError
        If the input dataframe does not have exactly one column.
    """

    import torch
    from flojoy import snapshot_download
    from transformers import BartTokenizer, BartForConditionalGeneration
    import pandas as pd

    # Number of token ids passed to the model per generate() call.
    max_chunk_tokens = 1024
    # Upper bound on the generated length for one chunk (half the chunk size).
    max_summary_tokens = max_chunk_tokens // 2

    input_df = default.m

    # Explicit raise instead of `assert` so the check is not stripped under
    # `python -O`; AssertionError is kept so the raised type is unchanged.
    if len(input_df.columns) != 1:
        raise AssertionError("Can only take a single-column dataframe as input")

    # Load the repo from either the local cache or from the web, and get the local path
    local_path = snapshot_download(
        repo_id="facebook/bart-large-cnn", revision="3d22493"
    )

    # Load the pre-trained BART model and its tokenizer from the same snapshot
    model = BartForConditionalGeneration.from_pretrained(local_path)
    tokenizer = BartTokenizer.from_pretrained(local_path)

    def _chunk_text(text):
        """Tokenize `text` without truncation and split the ids into chunks.

        Returns a list of tensors, each with a leading batch dimension of 1
        and at most `max_chunk_tokens` ids.
        """
        token_ids = tokenizer(
            text, max_length=None, return_tensors="pt", truncation=False
        )["input_ids"][0]
        # NOTE(review): the ids are sliced after tokenization, so presumably
        # only the first chunk starts with the tokenizer's BOS token and only
        # the last one ends with EOS -- confirm whether re-adding special
        # tokens per chunk would improve summaries of long texts.
        return [
            torch.unsqueeze(token_ids[start : start + max_chunk_tokens], 0)
            for start in range(0, len(token_ids), max_chunk_tokens)
        ]

    def _summarize_text(text):
        """Summarize every chunk of `text` and join the results with newlines."""
        chunk_summaries = []
        for chunk in _chunk_text(text):
            generated = model.generate(
                chunk,
                num_beams=4,
                max_length=max_summary_tokens,
                early_stopping=True,
            )
            # generate() returns one id sequence per returned candidate;
            # decode each and keep them newline-separated.
            chunk_summaries.append(
                "\n".join(
                    tokenizer.decode(
                        sequence,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=False,
                    )
                    for sequence in generated
                )
            )
        return "\n".join(chunk_summaries)

    column = input_df.columns[0]

    # Run generation under inference mode for every row of the input column.
    with torch.inference_mode():
        output_df = pd.DataFrame(
            input_df[column].apply(_summarize_text).rename("summary_text")
        )
    return DataFrame(df=output_df)

Find this Flojoy Block on GitHub

Example App

Having problems with this example app? Join our Discord community and we will help you out!
React Flow mini map

In this example, a READ_CSV node reads a single-column dataframe whose single row contains a long text. The BART_LARGE_CNN node then produces an output dataframe of the same length, with the corresponding summary in its "summary_text" column.