Skip to content

Commit 1e321a6

Browse files
authored
Fix typos (LAION-AI#1143)
Found via `codespell -S .mypy_cache,yarn.lock,*.json,*.ipynb -L rouge,nam,vie`
1 parent 364a4f5 commit 1e321a6

File tree

32 files changed

+50
-50
lines changed

32 files changed

+50
-50
lines changed

ansible/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
To test the ansible playbook on localhost run
22
`ansible-playbook -i test.inventory.ini dev.yaml`.\
3-
In case you're missing the ansible docker depencency install it with `ansible-galaxy collection install community.docker`.\
3+
In case you're missing the ansible docker dependency install it with `ansible-galaxy collection install community.docker`.\
44
Point Redis Insights to the Redis database by visiting localhost:8001 in a
55
browser and select "I already have a database" followed by "Connect to a Redis
66
Database".\

backend/oasst_backend/api/v1/hugging_face.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ async def get_text_toxicity(
1818
1919
Args:
2020
msg (str): the message that we want to analyze.
21-
api_client (ApiClient, optional): authentification of the user of the request.
21+
api_client (ApiClient, optional): authentication of the user of the request.
2222
Defaults to Depends(deps.get_trusted_api_client).
2323
2424
Returns:

backend/oasst_backend/config.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ class TreeManagerConfiguration(BaseModel):
1414
number is reached."""
1515

1616
max_initial_prompt_review: int = 100
17-
"""Maximum number of initial prompts under review before no more inital prompt tasks will be handed out."""
17+
"""Maximum number of initial prompts under review before no more initial prompt tasks will be handed out."""
1818

1919
max_tree_depth: int = 3
2020
"""Maximum depth of message tree."""
@@ -75,7 +75,7 @@ class TreeManagerConfiguration(BaseModel):
7575

7676
min_active_rankings_per_lang: int = 0
7777
"""When the number of active ranking tasks is below this value when a tree enters a terminal
78-
state an available trees in BACKLOG_RANKING will be actived (i.e. enters the RANKING state)."""
78+
state, any available trees in BACKLOG_RANKING will be activated (i.e. enter the RANKING state)."""
7979

8080
labels_initial_prompt: list[TextLabel] = [
8181
TextLabel.spam,

backend/oasst_backend/models/message_tree_state.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class State(str, Enum):
1212
"""States of the Open-Assistant message tree state machine."""
1313

1414
INITIAL_PROMPT_REVIEW = "initial_prompt_review"
15-
"""In this state the message tree consists only of a single inital prompt root node.
15+
"""In this state the message tree consists only of a single initial prompt root node.
1616
Initial prompt labeling tasks will determine if the tree goes into `growing` or
1717
`aborted_low_grade` state."""
1818

@@ -33,11 +33,11 @@ class State(str, Enum):
3333
compute the aggregated ranking scores that will appear in the dataset."""
3434

3535
READY_FOR_EXPORT = "ready_for_export"
36-
"""The Scoring algorithm computed rankings scores for all childern. The message tree can be
36+
"""The Scoring algorithm computed rankings scores for all children. The message tree can be
3737
exported as part of an Open-Assistant message tree dataset."""
3838

3939
SCORING_FAILED = "scoring_failed"
40-
"""An exception occured in the scoring algorithm."""
40+
"""An exception occurred in the scoring algorithm."""
4141

4242
ABORTED_LOW_GRADE = "aborted_low_grade"
4343
"""The system received too many bad reviews and stopped handing out tasks for this message tree."""

backend/oasst_backend/prompt_repository.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -484,7 +484,7 @@ def store_text_labels(self, text_labels: protocol_schema.TextLabels) -> tuple[Te
484484
OasstErrorCode.TASK_PAYLOAD_TYPE_MISMATCH,
485485
)
486486

487-
logger.debug(f"text_labels relpy: {valid_labels=}, {mandatory_labels=}")
487+
logger.debug(f"text_labels reply: {valid_labels=}, {mandatory_labels=}")
488488

489489
if valid_labels:
490490
if not all([label in valid_labels for label in text_labels.labels.keys()]):

backend/oasst_backend/task_repository.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ def close_task(self, frontend_message_id: str, allow_personal_tasks: bool = Fals
177177
if not allow_personal_tasks and not task.collective:
178178
raise OasstError("This is not a collective task", OasstErrorCode.TASK_NOT_COLLECTIVE)
179179
if task.done:
180-
raise OasstError("Allready closed", OasstErrorCode.TASK_ALREADY_DONE)
180+
raise OasstError("Already closed", OasstErrorCode.TASK_ALREADY_DONE)
181181

182182
task.done = True
183183
self.db.add(task)

backend/oasst_backend/tree_manager.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ def _auto_moderation(self, lang: str) -> None:
320320
if num_red_flag is not None and num_red_flag >= self.cfg.auto_mod_red_flags:
321321
if m.parent_id is None:
322322
logger.warning(
323-
f"[AUTO MOD] Halting tree {m.message_tree_id}, inital prompt got too many red flags ({m.emojis})."
323+
f"[AUTO MOD] Halting tree {m.message_tree_id}, initial prompt got too many red flags ({m.emojis})."
324324
)
325325
self.enter_low_grade_state(m.message_tree_id)
326326
else:
@@ -895,7 +895,7 @@ def update_message_ranks(
895895
logger.warning("The intersection of ranking results ID sets has less than two elements. Skipping.")
896896
continue
897897

898-
# keep only elements in commond set
898+
# keep only elements in common set
899899
ordered_ids_list = [list(filter(lambda x: x in common_set, ids)) for ids in ordered_ids_list]
900900
assert all(len(x) == len(common_set) for x in ordered_ids_list)
901901

@@ -1069,7 +1069,7 @@ def query_replies_need_review(self, lang: str) -> list[Message]:
10691069
"""
10701070

10711071
def query_incomplete_rankings(self, lang: str) -> list[IncompleteRankingsRow]:
1072-
"""Query parents which have childern that need further rankings"""
1072+
"""Query parents which have children that need further rankings"""
10731073

10741074
user_id = self.pr.user_id if not settings.DEBUG_ALLOW_DUPLICATE_TASKS else None
10751075
r = self.db.execute(
@@ -1256,7 +1256,7 @@ def query_tree_ranking_results(
12561256

12571257
@managed_tx_method(CommitMode.COMMIT)
12581258
def ensure_tree_states(self) -> None:
1259-
"""Add message tree state rows for all root nodes (inital prompt messages)."""
1259+
"""Add message tree state rows for all root nodes (initial prompt messages)."""
12601260

12611261
missing_tree_ids = self.query_misssing_tree_states()
12621262
for id in missing_tree_ids:
@@ -1598,7 +1598,7 @@ def purge_user_messages(
15981598
total_messages = sum(len(x) for x in replies_by_tree.values())
15991599
logger.debug(f"found: {len(replies_by_tree)} trees; {len(prompts)} prompts; {total_messages} messages;")
16001600

1601-
# remove all trees based on inital prompts of the user
1601+
# remove all trees based on initial prompts of the user
16021602
if purge_initial_prompts:
16031603
for p in prompts:
16041604
self.purge_message_tree(p.message_tree_id)
@@ -1636,7 +1636,7 @@ def is_descendant_of_deleted(m: Message) -> bool:
16361636
logger.debug(f"purging message: {m.id}")
16371637
self._purge_message_internal(m.id)
16381638

1639-
# update childern counts
1639+
# update children counts
16401640
self.pr.update_children_counts(m.message_tree_id)
16411641

16421642
# reactivate tree

backend/oasst_backend/utils/ranking.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def get_winner(pairs):
6666

6767
def get_ranking(pairs):
6868
"""
69-
Abuses concordance property to get a (not necessarily unqiue) ranking.
69+
Abuses concordance property to get a (not necessarily unique) ranking.
7070
The lack of uniqueness is due to the potential existence of multiple
7171
equally ranked winners. We have to pick one, which is where
7272
the non-uniqueness comes from

copilot/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ This will create a variety of aws roles and services needed for deployment.
2929
copilot deploy
3030
```
3131

32-
This will depoy the services but it won't be 100% ready for usage. Before being
32+
This will deploy the services but it won't be 100% ready for usage. Before being
3333
ready, we have to inspect the AWS Secrets manager and extract out the database
3434
credentials. Read those credentials then put them, and a few other secrets, in a
3535
`secrets.yml` file like the following:

docs/docs/data/schemas.mdx

+1-1
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ message GenerationExample {
182182

183183
class RankingExample:
184184
thread: Thread # The conversation thread before the message to be ranked
185-
messages: list[Message] # The messages to be ranked, in oder of decreasing preference
185+
messages: list[Message] # The messages to be ranked, in order of decreasing preference
186186

187187
```
188188

docs/docs/data/supervised-datasets.md

+5-5
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,16 @@ help.
1414
There are two large-scale projects in the area of instruction-following /
1515
multitask learning: Promptsource and Natural Instructions - these projects
1616
crowdsourced templates and turned existing NLP datasets into
17-
instruction-following seq2seq form in natural langauge. They include both
17+
instruction-following seq2seq form in natural language. They include both
1818
long-output training examples like generating a sentence that is a likely
1919
consequence of sentence in the prompt, and short-output, like rating prediction
2020
from review. (Pre-)training on such datasets should help model understand and
21-
follow instructions and teach it many abilities neccessary to perform a large
22-
set of tasks correctly. However, these data are not dialog-like - they do not
23-
look like a normal conversation.
21+
follow instructions and teach it many abilities necessary to perform a large set
22+
of tasks correctly. However, these data are not dialog-like - they do not look
23+
like a normal conversation.
2424

2525
There are also supervised dialog datasets such as Blended Skill Talk or SODA. In
26-
constrast to instruction-following datasets, dialog data is not as focused on
26+
contrast to instruction-following datasets, dialog data is not as focused on
2727
"academic tasks" or correctness, but encourage the model to respond naturally
2828
like a person would.
2929

model/reward/instructor/TODO.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Some other reward features we can use
22

3-
0. Finish classifcation feature
3+
0. Finish classification feature
44

55
1. Summaries from human feedback
66

model/reward/instructor/rank_datasets.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
[] support additional negative samples generated from other models.
1515
1616
For example we can use galactica-125m to generate a TLDR and assume it was
17-
inferior than the human perference one
17+
inferior to the human preference one
1818
1919
2020
"""

model/supervised_finetuning/custom_datasets/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,5 @@ Issues and TODO:
2323
- ideally we can update the config yaml and new dataset will be download from
2424
hub
2525

26-
- one possible idea is we upload the trasform format of these dataset to the
26+
- one possible idea is we upload the transform format of these dataset to the
2727
OA hub

model/supervised_finetuning/custom_datasets/qa_datasets.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ def __init__(self, cache_dir) -> None:
314314
for line in f:
315315
data = json.loads(line)
316316
joke = data["joke"]
317-
explanation = data["explaination"]
317+
explanation = data["explanation"]
318318
self.pairs.append((joke, explanation))
319319

320320
if len(question) > 0 and len(answer) > 0:

notebooks/closed-book-qa/T5_closed_book_QA_generators.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@
8383
try:
8484
objects.append(pickle.load(openfile))
8585
except EOFError:
86-
print("Problem laoding your pickle file, using the default array")
86+
print("Problem loading your pickle file, using the default array")
8787
pickle_fail = True
8888
break
8989

@@ -92,7 +92,7 @@
9292
paragraphs = [
9393
"Like for me, this thing is like a little side hobby, but it's also one that's deeply fulfilling. So not just from a business perspective, which is not the way I think about it. I just think from a life human perspective, it's I probably wouldn't have this kind of conversation with you off mic, like this long, this deep, this attentive. There's something really fulfilling about these conversations. So what advice would you have for me? What advice do you have for yourself? Oh, have you not introspected this that deeply? Oh, I have advice. I think the first advice I would give to you is I think you should have me on more often. Yeah. Yeah. That's first and foremost. And second is go on your podcast and have a conversation. Well, I would say you come on my podcast when you're ready. Yeah. When you feel like the product that I'm putting out would benefit from your presence and vice versa, not as a favor to a bro, but at the right time.",
9494
"Well, we really are looking through a two dimensional screen until it's what we intuit to be a three dimensional world and also inferring dynamic stuff, making it 4D. Anyway, is it possible to visualize some pretty pictures that give us a deeper sense of the truth of reality? I think that we will incrementally be able to do that. I think that, for example, the picture that we have of electrons and photons interacting and scattering, it may have not been possible until Faraday did all of his experiments and then Maxwell wrote down his equations. And we were then sort of forced by his equations to think in a new way. And then when Planck in 1900, desperate to try to solve the problem of black body radiation, what they call the ultraviolet catastrophe where Newton was predicting infinite energies where there weren't infinite energies in black body radiation. And he in desperation proposed packets of energy. Then once you've done that, and then you have an Einstein come along five years later and show how that explains the photoelectric effect.",
95-
"But man, I don't know how I would feel about just bacteria everywhere. Well, it would be depressing if it was true. I suppose depressing, I don't think, I don't. I don't know what's more depressing, bacteria everywhere or nothing everywhere. Yes, either of them are chilling. Yeah. But whether it's chilling or not, I don't think should force us to change our view about whether it's real or not. Yes. And what I'm saying may or may not be true. So how would you feel if we discovered life on Mars? Absolutely. It sounds like you would be less excited than some others because you're like, well. What I would be most interested in is how similar to life on Earth it would be. It would actually turn into quite a subtle problem because the likelihood of life having gone to and fro between Mars and the Earth is quite, I wouldn't say high, but it's not low, it's quite feasible. And so if we found life on Mars and it had very similar genetic code, but it was slightly different, most people would interpret that immediately as evidence that they've been transit one way or the other and that it was a common origin of life on Mars or on the Earth and it went one way or the other way.",
95+
"But man, I don't know how I would feel about just bacteria everywhere. Well, it would be depressing if it was true. I suppose depressing, I don't think, I don't. I don't know what's more depressing, bacteria everywhere or nothing everywhere. Yes, either of them are chilling. Yeah. But whether it's chilling or not, I don't think should force us to change our view about whether it's real or not. Yes. And what I'm saying may or may not be true. So how would you feel if we discovered life on Mars? Absolutely. It sounds like you would be less excited than some others because you're like, well. What I would be most interested in is how similar to life on Earth it would be. It would actually turn into quite a subtle problem because the likelihood of life having gone to and fro between Mars and the Earth is quite, I wouldn't say high, but it's not low, it's quite feasible. And so if we found life on Mars and it had very similar genetic code, but it was slightly different, most people would interpret that immediately as evidence that they've been transit one way or the other and that it was a common origin of life on Mars or on the Earth and it went one way or the other way.",
9696
]
9797

9898
# Make sure no paragraphs are too long for T5. It handles up to 512 tokens context length.

notebooks/data-augmentation/stackexchange-builder/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ Each question and all related answers are on a single line in JSONL format.
8686
#### Table/CSV/Parquet Format
8787

8888
There are a lot more columns left over in the table format. `_q` and `_a` are
89-
suffixes indiciating if the column came from the question or answer table as
89+
suffixes indicating if the column came from the question or answer table as
9090
leftover from a join statement.
9191

9292
```

notebooks/detoxify-evaluation/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ trained on
1515
| multilingual | xlm-roberta-base | Multilingual Toxic Comment Classification |
1616

1717
Unbiased and original models also have a 'small' version - but since normal
18-
models are not memory heavy, and small models perform noticably worse, they are
18+
models are not memory heavy, and small models perform noticeably worse, they are
1919
only described in the notebook
2020

2121
## All tests below were ran on a 3090TI

notebooks/example/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ this project. Please try and follow this structure as closely as possible. While
77
things will not exactly be the same for each notebook some principles we would
88
like to try ensure are:
99

10-
1. Each notebook or collection of related or dependant notebooks should live in
10+
1. Each notebook or collection of related or dependent notebooks should live in
1111
its own folder.
1212
1. Each notebook should have a markdown file with the same name as the notebook
1313
(or README.md if it's a single notebook folder) that explains what the

oasst-shared/oasst_shared/exceptions/oasst_api_error.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ class OasstError(Exception):
9595
http_status_code: HTTPStatus
9696

9797
def __init__(self, message: str, error_code: OasstErrorCode, http_status_code: HTTPStatus = HTTPStatus.BAD_REQUEST):
98-
super().__init__(message, error_code, http_status_code) # make excetpion picklable (fill args member)
98+
super().__init__(message, error_code, http_status_code) # make exception picklable (fill args member)
9999
self.message = message
100100
self.error_code = error_code
101101
self.http_status_code = http_status_code

scripts/data-collection/twitter/README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ conversation, or at least as a prompt with replies.
5757
guarantee of the quality of the tweets.
5858
- The tweet quality is the other major issue. We can get conversations through
5959
the currently made scripts, but they most likely don't match a useful
60-
instruction -> fulfilment. We are trying to filter the tweets through various
60+
instruction -> fulfillment. We are trying to filter the tweets through various
6161
means such as matching useful hashtags, or by using cosine similarity against
6262
known instructions.
6363
- The modern Twitter API has conversation_id as a field which can be a way to
@@ -68,7 +68,7 @@ conversation, or at least as a prompt with replies.
6868
## TODO
6969

7070
- Write scripts to filter existing conversations into useful instructions ->
71-
fulfilment with hashtags or cosine similarity.
71+
fulfillment with hashtags or cosine similarity.
7272
- Train model to detect if text is a suitable instruction. This could then be
7373
run through the conversations (or full tweet dump) to simplify the process.
7474
Related to issue #143.

scripts/data-collection/twitter/twitter_process_json.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
# This assumes data downloaded from https://archive.org/details/twitterstream
1111
# and that the internal .tar files are extracted locally.
12-
# They are large files so using something like 7Zip or WinRar migth be easier
12+
# They are large files so using something like 7Zip or WinRar might be easier
1313
# than putting all of it in scripts, but it is a possibility.
1414

1515
# I often work in notebooks. If you encounter any issue, please reach out to let me know.

scripts/data_augment/data_augment.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def __init__(self):
9494
def parse_single(self, essay):
9595
instructions = []
9696

97-
# Make stucture error (shuffle one paragraph with another)
97+
# Make structure error (shuffle one paragraph with another)
9898
essay_paragraphs = essay.split("\n\n") # Splitting a String by newline character (\n)
9999

100100
rand1 = random.randint(0, len(essay_paragraphs) - 1)
@@ -424,7 +424,7 @@ def parse(self, codes):
424424

425425

426426
def recognize_entities(text, model, n=4, person="ignore"):
427-
"""Given a text and a model for entity recognition, return the most occuring entites in the text as a string"""
427+
"""Given a text and a model for entity recognition, return the most occurring entities in the text as a string"""
428428
doc = model(text)
429429
if person == "ignore":
430430
ents = Counter([ent.text.strip() for ent in list(doc.ents) if len(ent.text.strip()) >= 5])

scripts/postprocessing/rankings.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def get_winner(pairs):
6666

6767
def get_ranking(pairs):
6868
"""
69-
Abuses concordance property to get a (not necessarily unqiue) ranking.
69+
Abuses concordance property to get a (not necessarily unique) ranking.
7070
The lack of uniqueness is due to the potential existence of multiple
7171
equally ranked winners. We have to pick one, which is where
7272
the non-uniqueness comes from

0 commit comments

Comments
 (0)