# llm-building-blocks-for-python-course/02-score.py at main · talkpython/llm-building-blocks-for-python-course · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import marimo

# Version string recorded in this notebook file.
__generated_with = "0.13.6"
# The notebook application; every `@app.cell` function below is registered on it.
app = marimo.App(width="medium")


@app.cell
def _():
    # Shared imports for the notebook; `llm`, `mo` and `pl` are returned so
    # that the cells listing them as parameters can use them.
    import marimo as mo
    import polars as pl
    import llm
    from dotenv import load_dotenv

    # Load environment variables from the local ".env" file
    # (presumably the LLM API keys -- confirm against your setup).
    load_dotenv(".env")
    return llm, mo, pl


@app.cell
def _(pl):
    # Read "spam.csv" into a polars DataFrame.
    df = pl.read_csv("spam.csv")
    # Count the rows per "label" value within the first 200 rows.
    df.head(200).group_by("label").len()
    return (df,)


@app.cell
async def _():
    from mosync import async_map_with_retry
    import asyncio

    # Toy coroutine for the demo: wait one second, then give back twice the input.
    async def delayed_double(x):
        await asyncio.sleep(1)
        return x * 2

    # Options for the demo run, gathered in one place.
    _demo_options = {
        "max_concurrency": 10,
        "description": "Showing a simple demo",
    }

    # Map the coroutine over the integers 0..99.
    results = await async_map_with_retry(range(100), delayed_double, **_demo_options)
    return (async_map_with_retry,)


@app.cell
def _(llm):
    # Print the id of every model returned by `llm.get_async_models()`,
    # one per line.
    for _async_model in llm.get_async_models():
        print(_async_model.model_id)
    return


@app.cell
def _(llm):
    from diskcache import Cache

    # Model responses are memoised in a diskcache `Cache` opened at
    # "accuracy-experiment".
    cache = Cache("accuracy-experiment")

    # Async model handles, keyed by model id.
    models = {
        _model_id: llm.get_async_model(_model_id)
        for _model_id in ("gpt-4", "gpt-4o")
    }

    # Defaults used by `classify` unless the caller overrides them.
    mod = "gpt-4o"
    prompt = "is this spam or ham? only reply with spam or ham"

    async def classify(text, prompt=prompt, model=mod):
        # The cache key is the (text, prompt, model) triple. The model is only
        # called on a miss, and its JSON response is stored before returning.
        cache_key = (text, prompt, model)
        if cache_key not in cache:
            answer = await models[model].prompt(prompt + "\n" + text).json()
            cache[cache_key] = answer
            return answer
        return cache[cache_key]
    return classify, prompt


@app.cell
async def _(classify):
    # Smoke test: classify one short message with the default prompt and model.
    await classify("hello there")
    return


@app.cell
async def _(async_map_with_retry, classify, df):
    # Number of leading rows of `df` that are sent to the model.
    n_eval = 200

    # The "text" values of the evaluation rows, in row order.
    _texts = df.head(n_eval)["text"].to_list()

    # Run `classify` over every text with max_concurrency=3.
    llm_results = await async_map_with_retry(
        _texts,
        classify,
        max_concurrency=3,
        description="Running LLM experiments",
    )
    return llm_results, n_eval


@app.cell
def _(df, llm_results, mo, n_eval, pl, prompt):
    # Pair every evaluated row with the model's reply. The rows are sliced with
    # `n_eval` rather than a hard-coded 200, so the numerator always covers the
    # same rows as the `n_eval` denominator reported below.
    _rows = df.head(n_eval).to_dicts()
    _replies = [i.result["content"] for i in llm_results]

    # A row counts as correct only when the reply equals the label exactly.
    n_correct = pl.DataFrame(
        {**d, "pred": p} for d, p in zip(_rows, _replies)
    ).filter(pl.col("label") == pl.col("pred")).shape[0]

    # Render the prompt and the resulting accuracy as the cell output.
    mo.md(f"""
    ### Prompt:
    ```
    {prompt}
    ```
    The accuracy is {n_correct}/{n_eval} = {n_correct/n_eval*100:.1f}%
    """)
    return


@app.cell
def _(mo):
    # Markdown notes listing prompt / model / accuracy combinations.
    mo.md(
        r"""
    Let's jot down some summaries.

    - "is this spam or ham? only reply with spam or ham" / `gpt-4` `67.0%`
    - "is this spam or ham? only reply with spam or ham" / `gpt-4o` `67.5%`
    - "sometimes we need to deal with spammy text messages, that often promise free/cheap good. is this spam or ham? only reply with spam or ham" / `gpt-4` `66.5%`
    - "sometimes we need to deal with spammy text messages, that often promise free/cheap good. is this spam or ham? only reply with spam or ham" / `gpt-4o` `72.5%`
    """
    )
    return


@app.cell
def _(df):
    # Bare expression: shows the whole DataFrame (marimo renders a cell's
    # last expression as its output).
    df
    return


@app.cell
def _(mo):
    # Markdown note on the cost of the LLM experiment.
    mo.md("""Running this experiment cost me about $2. In fairness: I had to rerun a few things a few times. But at the same time: that's pretty darn expensive for 6 variants on just 200 examples! Especially when you consider you could also build a spaCy/scikit-learn pipeline for this task.""")
    return


@app.cell
def _(df):
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    import numpy as np

    # The first 200 rows are used for validation, the last 200 rows for training.
    df_valid = df.head(200)
    df_train = df.tail(200)
    text_train, y_train = df_train["text"].to_list(), df_train["label"].to_list()
    text_valid, y_valid = df_valid["text"].to_list(), df_valid["label"].to_list()

    # Token counts fed into a logistic regression.
    pipe = make_pipeline(CountVectorizer(), LogisticRegression())
    pipe.fit(text_train, y_train)

    # It's pretty dang accurate
    preds = pipe.predict(text_valid)
    np.mean(preds == np.array(y_valid))
    return


# Run the notebook when this file is executed as a script.
if __name__ == "__main__":
    app.run()