Skip to content

Commit 1e321a6

Browse files
authored
Fix typos (LAION-AI#1143)
Found via `codespell -S .mypy_cache,yarn.lock,*.json,*.ipynb -L rouge,nam,vie`
1 parent 364a4f5 commit 1e321a6

File tree

32 files changed

+50
-50
lines changed

32 files changed

+50
-50
lines changed

ansible/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
To test the ansible playbook on localhost run
22
`ansible-playbook -i test.inventory.ini dev.yaml`.\
3-
In case you're missing the ansible docker depencency install it with `ansible-galaxy collection install community.docker`.\
3+
In case you're missing the ansible docker dependency install it with `ansible-galaxy collection install community.docker`.\
44
Point Redis Insights to the Redis database by visiting localhost:8001 in a
55
browser and select "I already have a database" followed by "Connect to a Redis
66
Database".\

backend/oasst_backend/api/v1/hugging_face.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ async def get_text_toxicity(
1818
1919
Args:
2020
msg (str): the message that we want to analyze.
21-
api_client (ApiClient, optional): authentification of the user of the request.
21+
api_client (ApiClient, optional): authentication of the user of the request.
2222
Defaults to Depends(deps.get_trusted_api_client).
2323
2424
Returns:

backend/oasst_backend/config.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ class TreeManagerConfiguration(BaseModel):
1414
number is reached."""
1515

1616
max_initial_prompt_review: int = 100
17-
"""Maximum number of initial prompts under review before no more inital prompt tasks will be handed out."""
17+
"""Maximum number of initial prompts under review before no more initial prompt tasks will be handed out."""
1818

1919
max_tree_depth: int = 3
2020
"""Maximum depth of message tree."""
@@ -75,7 +75,7 @@ class TreeManagerConfiguration(BaseModel):
7575

7676
min_active_rankings_per_lang: int = 0
7777
"""When the number of active ranking tasks is below this value when a tree enters a terminal
78-
state an available trees in BACKLOG_RANKING will be actived (i.e. enters the RANKING state)."""
78+
state, any available trees in BACKLOG_RANKING will be activated (i.e. enter the RANKING state)."""
7979

8080
labels_initial_prompt: list[TextLabel] = [
8181
TextLabel.spam,

backend/oasst_backend/models/message_tree_state.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class State(str, Enum):
1212
"""States of the Open-Assistant message tree state machine."""
1313

1414
INITIAL_PROMPT_REVIEW = "initial_prompt_review"
15-
"""In this state the message tree consists only of a single inital prompt root node.
15+
"""In this state the message tree consists only of a single initial prompt root node.
1616
Initial prompt labeling tasks will determine if the tree goes into `growing` or
1717
`aborted_low_grade` state."""
1818

@@ -33,11 +33,11 @@ class State(str, Enum):
3333
compute the aggregated ranking scores that will appear in the dataset."""
3434

3535
READY_FOR_EXPORT = "ready_for_export"
36-
"""The Scoring algorithm computed rankings scores for all childern. The message tree can be
36+
"""The Scoring algorithm computed rankings scores for all children. The message tree can be
3737
exported as part of an Open-Assistant message tree dataset."""
3838

3939
SCORING_FAILED = "scoring_failed"
40-
"""An exception occured in the scoring algorithm."""
40+
"""An exception occurred in the scoring algorithm."""
4141

4242
ABORTED_LOW_GRADE = "aborted_low_grade"
4343
"""The system received too many bad reviews and stopped handing out tasks for this message tree."""

backend/oasst_backend/prompt_repository.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -484,7 +484,7 @@ def store_text_labels(self, text_labels: protocol_schema.TextLabels) -> tuple[Te
484484
OasstErrorCode.TASK_PAYLOAD_TYPE_MISMATCH,
485485
)
486486

487-
logger.debug(f"text_labels relpy: {valid_labels=}, {mandatory_labels=}")
487+
logger.debug(f"text_labels reply: {valid_labels=}, {mandatory_labels=}")
488488

489489
if valid_labels:
490490
if not all([label in valid_labels for label in text_labels.labels.keys()]):

backend/oasst_backend/task_repository.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ def close_task(self, frontend_message_id: str, allow_personal_tasks: bool = Fals
177177
if not allow_personal_tasks and not task.collective:
178178
raise OasstError("This is not a collective task", OasstErrorCode.TASK_NOT_COLLECTIVE)
179179
if task.done:
180-
raise OasstError("Allready closed", OasstErrorCode.TASK_ALREADY_DONE)
180+
raise OasstError("Already closed", OasstErrorCode.TASK_ALREADY_DONE)
181181

182182
task.done = True
183183
self.db.add(task)

backend/oasst_backend/tree_manager.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ def _auto_moderation(self, lang: str) -> None:
320320
if num_red_flag is not None and num_red_flag >= self.cfg.auto_mod_red_flags:
321321
if m.parent_id is None:
322322
logger.warning(
323-
f"[AUTO MOD] Halting tree {m.message_tree_id}, inital prompt got too many red flags ({m.emojis})."
323+
f"[AUTO MOD] Halting tree {m.message_tree_id}, initial prompt got too many red flags ({m.emojis})."
324324
)
325325
self.enter_low_grade_state(m.message_tree_id)
326326
else:
@@ -895,7 +895,7 @@ def update_message_ranks(
895895
logger.warning("The intersection of ranking results ID sets has less than two elements. Skipping.")
896896
continue
897897

898-
# keep only elements in commond set
898+
# keep only elements in common set
899899
ordered_ids_list = [list(filter(lambda x: x in common_set, ids)) for ids in ordered_ids_list]
900900
assert all(len(x) == len(common_set) for x in ordered_ids_list)
901901

@@ -1069,7 +1069,7 @@ def query_replies_need_review(self, lang: str) -> list[Message]:
10691069
"""
10701070

10711071
def query_incomplete_rankings(self, lang: str) -> list[IncompleteRankingsRow]:
1072-
"""Query parents which have childern that need further rankings"""
1072+
"""Query parents which have children that need further rankings"""
10731073

10741074
user_id = self.pr.user_id if not settings.DEBUG_ALLOW_DUPLICATE_TASKS else None
10751075
r = self.db.execute(
@@ -1256,7 +1256,7 @@ def query_tree_ranking_results(
12561256

12571257
@managed_tx_method(CommitMode.COMMIT)
12581258
def ensure_tree_states(self) -> None:
1259-
"""Add message tree state rows for all root nodes (inital prompt messages)."""
1259+
"""Add message tree state rows for all root nodes (initial prompt messages)."""
12601260

12611261
missing_tree_ids = self.query_misssing_tree_states()
12621262
for id in missing_tree_ids:
@@ -1598,7 +1598,7 @@ def purge_user_messages(
15981598
total_messages = sum(len(x) for x in replies_by_tree.values())
15991599
logger.debug(f"found: {len(replies_by_tree)} trees; {len(prompts)} prompts; {total_messages} messages;")
16001600

1601-
# remove all trees based on inital prompts of the user
1601+
# remove all trees based on initial prompts of the user
16021602
if purge_initial_prompts:
16031603
for p in prompts:
16041604
self.purge_message_tree(p.message_tree_id)
@@ -1636,7 +1636,7 @@ def is_descendant_of_deleted(m: Message) -> bool:
16361636
logger.debug(f"purging message: {m.id}")
16371637
self._purge_message_internal(m.id)
16381638

1639-
# update childern counts
1639+
# update children counts
16401640
self.pr.update_children_counts(m.message_tree_id)
16411641

16421642
# reactivate tree

backend/oasst_backend/utils/ranking.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def get_winner(pairs):
6666

6767
def get_ranking(pairs):
6868
"""
69-
Abuses concordance property to get a (not necessarily unqiue) ranking.
69+
Abuses concordance property to get a (not necessarily unique) ranking.
7070
The lack of uniqueness is due to the potential existence of multiple
7171
equally ranked winners. We have to pick one, which is where
7272
the non-uniqueness comes from

copilot/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ This will create a variety of aws roles and services needed for deployment.
2929
copilot deploy
3030
```
3131

32-
This will depoy the services but it won't be 100% ready for usage. Before being
32+
This will deploy the services but it won't be 100% ready for usage. Before being
3333
ready, we have to inspect the AWS Secrets manager and extract out the database
3434
credentials. Read those credentials then put them, and a few other secrets, in a
3535
`secrets.yml` file like the following:

docs/docs/data/schemas.mdx

+1-1
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ message GenerationExample {
182182

183183
class RankingExample:
184184
thread: Thread # The conversation thread before the message to be ranked
185-
messages: list[Message] # The messages to be ranked, in oder of decreasing preference
185+
messages: list[Message] # The messages to be ranked, in order of decreasing preference
186186

187187
```
188188

docs/docs/data/supervised-datasets.md

+5-5
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,16 @@ help.
1414
There are two large-scale projects in the area of instruction-following /
1515
multitask learning: Promptsource and Natural Instructions - these projects
1616
crowdsourced templates and turned existing NLP datasets into
17-
instruction-following seq2seq form in natural langauge. They include both
17+
instruction-following seq2seq form in natural language. They include both
1818
long-output training examples like generating a sentence that is a likely
1919
consequence of sentence in the prompt, and short-output, like rating prediction
2020
from review. (Pre-)training on such datasets should help model understand and
21-
follow instructions and teach it many abilities neccessary to perform a large
22-
set of tasks correctly. However, these data are not dialog-like - they do not
23-
look like a normal conversation.
21+
follow instructions and teach it many abilities necessary to perform a large set
22+
of tasks correctly. However, these data are not dialog-like - they do not look
23+
like a normal conversation.
2424

2525
There are also supervised dialog datasets such as Blended Skill Talk or SODA. In
26-
constrast to instruction-following datasets, dialog data is not as focused on
26+
contrast to instruction-following datasets, dialog data is not as focused on
2727
"academic tasks" or correctness, but encourage the model to respond naturally
2828
like a person would.
2929

model/reward/instructor/TODO.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Some other reward features we can use
22

3-
0. Finish classifcation feature
3+
0. Finish classification feature
44

55
1. Summaries from human feedback
66

model/reward/instructor/rank_datasets.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
[] support additional negative samples generated from other models.
1515
1616
For example we can use galactica-125m to generate a TLDR and assume it was
17-
inferior than the human perference one
17+
inferior to the human preference one
1818
1919
2020
"""

model/supervised_finetuning/custom_datasets/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,5 @@ Issues and TODO:
2323
- ideally we can update the config yaml and new dataset will be download from
2424
hub
2525

26-
- one possible idea is we upload the trasform format of these dataset to the
26+
- one possible idea is we upload the transform format of these dataset to the
2727
OA hub

model/supervised_finetuning/custom_datasets/qa_datasets.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ def __init__(self, cache_dir) -> None:
314314
for line in f:
315315
data = json.loads(line)
316316
joke = data["joke"]
317-
explanation = data["explaination"]
317+
explanation = data["explanation"]
318318
self.pairs.append((joke, explanation))
319319

320320
if len(question) > 0 and len(answer) > 0:

notebooks/closed-book-qa/T5_closed_book_QA_generators.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@
8383
try:
8484
objects.append(pickle.load(openfile))
8585
except EOFError:
86-
print("Problem laoding your pickle file, using the default array")
86+
print("Problem loading your pickle file, using the default array")
8787
pickle_fail = True
8888
break
8989

@@ -92,7 +92,7 @@
9292
paragraphs = [
9393
"Like for me, this thing is like a little side hobby, but it's also one that's deeply fulfilling. So not just from a business perspective, which is not the way I think about it. I just think from a life human perspective, it's I probably wouldn't have this kind of conversation with you off mic, like this long, this deep, this attentive. There's something really fulfilling about these conversations. So what advice would you have for me? What advice do you have for yourself? Oh, have you not introspected this that deeply? Oh, I have advice. I think the first advice I would give to you is I think you should have me on more often. Yeah. Yeah. That's first and foremost. And second is go on your podcast and have a conversation. Well, I would say you come on my podcast when you're ready. Yeah. When you feel like the product that I'm putting out would benefit from your presence and vice versa, not as a favor to a bro, but at the right time.",
9494
"Well, we really are looking through a two dimensional screen until it's what we intuit to be a three dimensional world and also inferring dynamic stuff, making it 4D. Anyway, is it possible to visualize some pretty pictures that give us a deeper sense of the truth of reality? I think that we will incrementally be able to do that. I think that, for example, the picture that we have of electrons and photons interacting and scattering, it may have not been possible until Faraday did all of his experiments and then Maxwell wrote down his equations. And we were then sort of forced by his equations to think in a new way. And then when Planck in 1900, desperate to try to solve the problem of black body radiation, what they call the ultraviolet catastrophe where Newton was predicting infinite energies where there weren't infinite energies in black body radiation. And he in desperation proposed packets of energy. Then once you've done that, and then you have an Einstein come along five years later and show how that explains the photoelectric effect.",
95-
"But man, I don't know how I would feel about just bacteria everywhere. Well, it would be depressing if it was true. I suppose depressing, I don't think, I don't. I don't know what's more depressing, bacteria everywhere or nothing everywhere. Yes, either of them are chilling. Yeah. But whether it's chilling or not, I don't think should force us to change our view about whether it's real or not. Yes. And what I'm saying may or may not be true. So how would you feel if we discovered life on Mars? Absolutely. It sounds like you would be less excited than some others because you're like, well. What I would be most interested in is how similar to life on Earth it would be. It would actually turn into quite a subtle problem because the likelihood of life having gone to and fro between Mars and the Earth is quite, I wouldn't say high, but it's not low, it's quite feasible. And so if we found life on Mars and it had very similar genetic code, but it was slightly different, most people would interpret that immediately as evidence that they've been transit one way or the other and that it was a common origin of life on Mars or on the Earth and it went one way or the other way.",
95+
"But man, I don't know how I would feel about just bacteria everywhere. Well, it would be depressing if it was true. I suppose depressing, I don't think, I don't. I don't know what's more depressing, bacteria everywhere or nothing everywhere. Yes, either of them are chilling. Yeah. But whether it's chilling or not, I don't think should force us to change our view about whether it's real or not. Yes. And what I'm saying may or may not be true. So how would you feel if we discovered life on Mars? Absolutely. It sounds like you would be less excited than some others because you're like, well. What I would be most interested in is how similar to life on Earth it would be. It would actually turn into quite a subtle problem because the likelihood of life having gone to and fro between Mars and the Earth is quite, I wouldn't say high, but it's not low, it's quite feasible. And so if we found life on Mars and it had very similar genetic code, but it was slightly different, most people would interpret that immediately as evidence that they've been transit one way or the other and that it was a common origin of life on Mars or on the Earth and it went one way or the other way.",
9696
]
9797

9898
# Make sure no paragraphs are too long for T5. It handles up to 512 tokens context length.

notebooks/data-augmentation/stackexchange-builder/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ Each question and all related answers are on a single line in JSONL format.
8686
#### Table/CSV/Parquet Format
8787

8888
There are a lot more columns left over in the table format. `_q` and `_a` are
89-
suffixes indiciating if the column came from the question or answer table as
89+
suffixes indicating if the column came from the question or answer table as
9090
leftover from a join statement.
9191

9292
```

notebooks/detoxify-evaluation/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ trained on
1515
| multilingual | xlm-roberta-base | Multilingual Toxic Comment Classification |
1616

1717
Unbiased and original models also have a 'small' version - but since normal
18-
models are not memory heavy, and small models perform noticably worse, they are
18+
models are not memory heavy, and small models perform noticeably worse, they are
1919
only described in the notebook
2020

2121
## All tests below were ran on a 3090TI

notebooks/example/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ this project. Please try and follow this structure as closely as possible. While
77
things will not exactly be the same for each notebook some principles we would
88
like to try ensure are:
99

10-
1. Each notebook or collection of related or dependant notebooks should live in
10+
1. Each notebook or collection of related or dependent notebooks should live in
1111
its own folder.
1212
1. Each notebook should have a markdown file with the same name as the notebook
1313
(or README.md if it's a single notebook folder) that explains what the

oasst-shared/oasst_shared/exceptions/oasst_api_error.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ class OasstError(Exception):
9595
http_status_code: HTTPStatus
9696

9797
def __init__(self, message: str, error_code: OasstErrorCode, http_status_code: HTTPStatus = HTTPStatus.BAD_REQUEST):
98-
super().__init__(message, error_code, http_status_code) # make excetpion picklable (fill args member)
98+
super().__init__(message, error_code, http_status_code) # make exception picklable (fill args member)
9999
self.message = message
100100
self.error_code = error_code
101101
self.http_status_code = http_status_code

scripts/data-collection/twitter/README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ conversation, or at least as a prompt with replies.
5757
guarantee of the quality of the tweets.
5858
- The tweet quality is the other major issue. We can get conversations through
5959
the currently made scripts, but they most likely don't match a useful
60-
instruction -> fulfilment. We are trying to filter the tweets through various
60+
instruction -> fulfillment. We are trying to filter the tweets through various
6161
means such as matching useful hashtags, or by using cosine similarity against
6262
known instructions.
6363
- The modern Twitter API has conversation_id as a field which can be a way to
@@ -68,7 +68,7 @@ conversation, or at least as a prompt with replies.
6868
## TODO
6969

7070
- Write scripts to filter existing conversations into useful instructions ->
71-
fulfilment with hashtags or cosine similarity.
71+
fulfillment with hashtags or cosine similarity.
7272
- Train model to detect if text is a suitable instruction. This could then be
7373
run through the conversations (or full tweet dump) to simplify the process.
7474
Related to issue #143.

scripts/data-collection/twitter/twitter_process_json.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
# This assumes data downloaded from https://archive.org/details/twitterstream
1111
# and that the internal .tar files are extracted locally.
12-
# They are large files so using something like 7Zip or WinRar migth be easier
12+
# They are large files so using something like 7Zip or WinRar might be easier
1313
# than putting all of it in scripts, but it is a possibility.
1414

1515
# I often work in notebooks. If you encounter any issue, please reach out to let me know.

scripts/data_augment/data_augment.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def __init__(self):
9494
def parse_single(self, essay):
9595
instructions = []
9696

97-
# Make stucture error (shuffle one paragraph with another)
97+
# Make structure error (shuffle one paragraph with another)
9898
essay_paragraphs = essay.split("\n\n") # Splitting a String by newline character (\n)
9999

100100
rand1 = random.randint(0, len(essay_paragraphs) - 1)
@@ -424,7 +424,7 @@ def parse(self, codes):
424424

425425

426426
def recognize_entities(text, model, n=4, person="ignore"):
427-
"""Given a text and a model for entity recognition, return the most occuring entites in the text as a string"""
427+
"""Given a text and a model for entity recognition, return the most occurring entities in the text as a string"""
428428
doc = model(text)
429429
if person == "ignore":
430430
ents = Counter([ent.text.strip() for ent in list(doc.ents) if len(ent.text.strip()) >= 5])

scripts/postprocessing/rankings.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def get_winner(pairs):
6666

6767
def get_ranking(pairs):
6868
"""
69-
Abuses concordance property to get a (not necessarily unqiue) ranking.
69+
Abuses concordance property to get a (not necessarily unique) ranking.
7070
The lack of uniqueness is due to the potential existence of multiple
7171
equally ranked winners. We have to pick one, which is where
7272
the non-uniqueness comes from

0 commit comments

Comments
 (0)