forked from gptscript-ai/gptscript
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjudge.go
128 lines (107 loc) · 3.22 KB
/
judge.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
package judge
import (
"context"
"encoding/json"
"fmt"
"github.com/getkin/kin-openapi/openapi3gen"
openai "github.com/gptscript-ai/chat-completion-client"
)
// instructions is the fmt template for the judge's system prompt. It contains
// two %s verbs: Equal fills the first with the caller-supplied criteria and
// the second with the JSON schema text held in Judge.comparisonSchema. The
// template also tells the model to reply with a JSON object carrying "equal"
// and "reasoning" fields, which Equal decodes into a ruling.
const instructions = `"actual" is considered equivalent to "expected" if and only if the following rules are satisfied:
%s
When given JSON objects that conform to the following JSONSchema:
%s
Determine if "actual" is considered equivalent to "expected".
After making a determination, respond with a JSON object that conforms to the following JSONSchema:
{
"name": "ruling",
"type": "object",
"properties": {
"equal": {
"type": "boolean",
"description": "Set to true if and only if actual is considered equal to expected."
},
"reasoning": {
"type": "string",
"description": "The reasoning used to come to the determination"
}
},
"required": [
"equal",
"reasoning"
]
}
Your responses are concise and include only the json object described above.
`
// Judge asks a chat-completion model whether two values of type T are
// equivalent under caller-supplied criteria. Construct one with New.
type Judge[T any] struct {
// client is the chat-completion client Equal sends its request through.
client *openai.Client
// comparisonSchema is the indented JSON schema text for comparison[T],
// generated once in New and embedded in the system prompt by Equal.
comparisonSchema string
}
// comparison pairs the expected and actual values under judgement. New derives
// the prompt's JSON schema from this type, and Equal marshals an instance of it
// to JSON as the user message.
type comparison[T any] struct {
Expected T `json:"expected"`
Actual T `json:"actual"`
}
// ruling is the JSON verdict that Equal decodes from the model's reply; its
// fields mirror the "equal" and "reasoning" properties requested in the
// instructions template.
type ruling struct {
Equal bool `json:"equal"`
Reasoning string `json:"reasoning"`
}
// New builds a Judge for values of type T that talks to the model through
// client.
//
// It generates a schema for the internal comparison[T] envelope (the
// "expected"/"actual" pair) with openapi3gen, renders that schema as indented
// JSON, and stores the text on the Judge so Equal can embed it in the system
// prompt. An error is returned if the schema cannot be generated or marshaled.
func New[T any](client *openai.Client) (*Judge[T], error) {
	exportOpts := openapi3gen.ExportComponentSchemasOptions{
		ExportComponentSchemas: true,
		ExportGenerics:         false,
	}

	schemaRef, err := openapi3gen.NewSchemaRefForValue(
		new(comparison[T]),
		nil,
		openapi3gen.CreateComponentSchemas(exportOpts),
	)
	if err != nil {
		return nil, fmt.Errorf("failed to generate JSONSchema for %T: %w", new(T), err)
	}

	schemaJSON, err := json.MarshalIndent(schemaRef, "", " ")
	if err != nil {
		return nil, fmt.Errorf("failed to marshal JSONSchema for %T: %w", new(T), err)
	}

	judge := &Judge[T]{
		client:           client,
		comparisonSchema: string(schemaJSON),
	}
	return judge, nil
}
// Equal asks the model whether actual is equivalent to expected under the
// given criteria.
//
// The pair is marshaled to indented JSON and sent as the user message. The
// system message is the instructions template filled in with criteria and the
// schema text prepared by New. The request uses the GPT-4o model constant, a
// zero temperature, a single choice, and the JSON-object response format. The
// first choice's content is decoded into a ruling.
//
// It returns the ruling's equal flag and reasoning. On any failure (marshaling
// the pair, the completion call, an empty choice list, or an undecodable
// reply) it returns false, an empty reasoning, and a wrapped error.
func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria string) (equal bool, reasoning string, err error) {
	pair := comparison[T]{Expected: expected, Actual: actual}
	userContent, err := json.MarshalIndent(&pair, "", " ")
	if err != nil {
		return false, "", fmt.Errorf("failed to marshal judge testcase JSON: %w", err)
	}

	systemPrompt := fmt.Sprintf(instructions, criteria, j.comparisonSchema)

	// The request takes a *float32; point it at an explicit zero.
	var zeroTemperature float32

	messages := []openai.ChatCompletionMessage{
		{Role: openai.ChatMessageRoleSystem, Content: systemPrompt},
		{Role: openai.ChatMessageRoleUser, Content: string(userContent)},
	}

	completion, err := j.client.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
		Model:       openai.GPT4o,
		Temperature: &zeroTemperature,
		N:           1,
		ResponseFormat: &openai.ChatCompletionResponseFormat{
			Type: openai.ChatCompletionResponseFormatTypeJSONObject,
		},
		Messages: messages,
	})
	if err != nil {
		return false, "", fmt.Errorf("failed to create chat completion request: %w", err)
	}
	if len(completion.Choices) == 0 {
		return false, "", fmt.Errorf("chat completion request returned no choices")
	}

	var verdict ruling
	reply := []byte(completion.Choices[0].Message.Content)
	if err = json.Unmarshal(reply, &verdict); err != nil {
		return false, "", fmt.Errorf("failed to unmarshal judge ruling: %w", err)
	}

	return verdict.Equal, verdict.Reasoning, nil
}