from dataclasses import dataclass, replace
from typing import Any

import numpy as np
import numpy.typing as npt
from scipy.stats import kendalltau


@dataclass
class Voter:
    """
    Represents a single voter.

    This tabulates the number of good votes, total votes, and points.
    We only put well-behaved people on the scoreboard and filter out the
    badly behaved ones.
    """

    uid: Any
    num_votes: int
    num_good_votes: int
    num_prompts: int
    num_good_prompts: int
    num_rankings: int
    num_good_rankings: int
    #####################
    voting_points: int
    prompt_points: int
    ranking_points: int

    def voter_quality(self):
        return self.num_good_votes / self.num_votes

    def rank_quality(self):
        return self.num_good_rankings / self.num_rankings

    def prompt_quality(self):
        return self.num_good_prompts / self.num_prompts

    def is_well_behaved(self, threshold_vote, threshold_prompt, threshold_rank):
        return (
            self.voter_quality() > threshold_vote
            and self.prompt_quality() > threshold_prompt
            and self.rank_quality() > threshold_rank
        )

    def total_points(self, voting_weight, prompt_weight, ranking_weight):
        return (
            voting_weight * self.voting_points
            + prompt_weight * self.prompt_points
            + ranking_weight * self.ranking_points
        )


def score_update_votes(new_vote: int, consensus: npt.ArrayLike, voter_data: Voter) -> Voter:
    """
    Return the new "quality score" and points for a voter after that voter
    cast a vote on a question.

    This function is only to be run when archiving a question, i.e. once
    the question has had sufficiently many votes, or we can't get more
    than "K" bits of information. The consensus is the array of all votes
    cast by all voters for that question; we update the voter data using
    this new information.

    Parameters:
        new_vote (int): the index of the vote cast by the voter
        consensus (ArrayLike): all votes cast for this question
        voter_data (Voter): a "Voter" object representing the person casting the "new_vote"

    Returns:
        updated_voter (Voter): the new "quality score" and points for the voter
    """
    # The double argsort produces the rank of each vote option, e.g. for
    # [100, 300, 200] it returns [0, 2, 1], since 100 is the lowest,
    # 300 the highest and 200 the middle value.
    consensus_ranking = np.argsort(np.argsort(consensus))
    new_points = consensus_ranking[new_vote] + voter_data.voting_points
    # Correct for 0-indexing: if the vote ranks closer to the "right" end
    # of the consensus than the "wrong" end, it counts as a good vote.
    new_good_votes = int(consensus_ranking[new_vote] > (len(consensus) - 1) / 2) + voter_data.num_good_votes
    new_num_votes = voter_data.num_votes + 1
    return replace(voter_data, num_votes=new_num_votes, num_good_votes=new_good_votes, voting_points=new_points)
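
# Worked example (see the demo at the bottom of this file): for
# consensus = [200, 300, 100, 500] the double argsort yields the ranks
# [1, 2, 0, 3], so a vote for index 3 (the consensus favorite, 500) earns
# 3 points and counts as good, since 3 > (4 - 1) / 2 = 1.5, while a vote
# for index 2 (the least popular option, 100) earns 0 points.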


def score_update_prompts(consensus: npt.ArrayLike, voter_data: Voter) -> Voter:
    """
    Return the gain of points for a given prompt's votes.

    In contrast to the other score-updating functions, we can run this
    online as new votes come in, rather than only when archiving a
    question.

    Parameters:
        consensus (ArrayLike): all votes cast for this question
        voter_data (Voter): a "Voter" object representing the person that wrote the prompt

    Returns:
        updated_voter (Voter): the new "quality score" and points for the voter
    """
    # Centered weight vector over the vote options, e.g. for four options
    # this is [-1, 0, 1, 2]: negative weights for votes at the "bad" end
    # of the scale, positive weights for votes at the "good" end.
    consensus_ranking = np.arange(len(consensus)) - len(consensus) // 2 + 1
    # Expected consensus ranking, i.e. normalize the votes and take the
    # weighted sum.
    delta_votes = np.sum(consensus_ranking * consensus / np.sum(consensus))
    new_points = delta_votes + voter_data.prompt_points
    # A prompt counts as good when the expected vote leans toward the
    # positive end of the scale.
    new_good_prompts = int(delta_votes > 0) + voter_data.num_good_prompts
    new_num_prompts = voter_data.num_prompts + 1
    return replace(
        voter_data,
        num_prompts=new_num_prompts,
        num_good_prompts=new_good_prompts,
        prompt_points=new_points,
    )
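
# Worked example: for consensus = [200, 300, 100, 500] the weight vector is
# [-1, 0, 1, 2], so delta_votes = (-200 + 0 + 100 + 1000) / 1100 ≈ 0.82 and
# the prompt counts as good, since the weighted vote leans positive.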


def score_update_ranking(user_ranking: npt.ArrayLike, consensus_ranking: npt.ArrayLike, voter_data: Voter) -> Voter:
    """
    Return the gain of points for a given ranking's votes.

    This function is only to be run when archiving a question, i.e. once
    the question has had sufficiently many votes, or we can't get more
    than "K" bits of information.

    We use the bubble-sort distance (or "Kendall tau" distance) to compare
    the two rankings. We use this over the Spearman correlation since:

        "[Kendall's τ] approaches a normal distribution more rapidly than
        ρ, as N, the sample size, increases; and τ is also more tractable
        mathematically, particularly when ties are present"

        Gilpin, A. R. (1993). Table for conversion of Kendall's tau to
        Spearman's rho within the context of measures of magnitude of
        effect for meta-analysis.

    Further, in "Research Design and Statistical Analysis" (2nd ed., 2003),
    the authors note that, at least from a significance-test point of view,
    the two will yield the same p-values.

    Parameters:
        user_ranking (ArrayLike): ranking produced by the user
        consensus_ranking (ArrayLike): ranking produced by running the voting algorithm to merge all rankings into a consensus
        voter_data (Voter): a "Voter" object representing the person that produced the ranking

    Returns:
        updated_voter (Voter): the new "quality score" and points for the voter
    """
    # scipy's kendalltau returns the correlation coefficient tau in [-1, 1]
    # (an affine transform of the bubble-sort distance); the p-value is unused.
    bubble_sort_distance, _p_value = kendalltau(user_ranking, consensus_ranking)
    # Normalize tau from [-1, 1] into the [0, 1] range, so that 1 means the
    # user ranking matches the consensus exactly.
    bubble_sort_distance = (1 + bubble_sort_distance) / 2
    new_points = bubble_sort_distance + voter_data.ranking_points
    new_good_rankings = int(bubble_sort_distance > 0.5) + voter_data.num_good_rankings
    new_num_rankings = voter_data.num_rankings + 1
    return replace(
        voter_data,
        num_rankings=new_num_rankings,
        num_good_rankings=new_good_rankings,
        ranking_points=new_points,
    )
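
# Worked example: kendalltau([0, 2, 1], [0, 2, 1]) gives tau = 1, which
# normalizes to 1.0, while the fully reversed ranking [2, 0, 1] gives
# tau = -1, which normalizes to 0.0 (the "worst rank" case in the demo
# below).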


if __name__ == "__main__":
    demo_voter = Voter(
        "abc",
        num_votes=10,
        num_good_votes=2,
        num_prompts=10,
        num_good_prompts=2,
        num_rankings=10,
        num_good_rankings=2,
        voting_points=6,
        prompt_points=0,
        ranking_points=0,
    )
    new_vote = 3
    consensus = np.array([200, 300, 100, 500])
    print(demo_voter)
    print("best vote ", score_update_votes(new_vote, consensus, demo_voter))
    new_vote = 2
    print("worst vote ", score_update_votes(new_vote, consensus, demo_voter))
    new_vote = 1
    print("medium vote ", score_update_votes(new_vote, consensus, demo_voter))
    print("prompt writer", score_update_prompts(consensus, demo_voter))
    print("best rank ", score_update_ranking(np.array([0, 2, 1]), np.array([0, 2, 1]), demo_voter))
    # Note the labels here: against the consensus [0, 2, 1], the ranking
    # [1, 0, 2] gives tau = -1/3 (medium), while [2, 0, 1] is the full
    # reversal with tau = -1 (worst).
    print("medium rank ", score_update_ranking(np.array([1, 0, 2]), np.array([0, 2, 1]), demo_voter))
    print("worst rank ", score_update_ranking(np.array([2, 0, 1]), np.array([0, 2, 1]), demo_voter))