Commit 27033a5: fix typos (LAION-AI#3216)

fix typos - "try and" isn't correct grammar but I'm willing to leave it be if requested.

Parent: 1b608cd

File tree: 31 files changed (+47 -47 lines)


backend/README.md (+1 -1)

@@ -78,7 +78,7 @@ Note: The api docs should be automatically updated by the
 
 ## Running Celery Worker(s) for API and periodic tasks
 
-Celery workers are used for Huggiface API calls like toxicity and feature
+Celery workers are used for Huggingface API calls like toxicity and feature
 extraction. Celery Beat along with worker is used for periodic tasks like user
 streak update

backend/oasst_backend/prompt_repository.py (+1 -1)

@@ -253,7 +253,7 @@ def store_text_reply(
         )
         if not ts.active:
             logger.warning(
-                f"Received messsage for inactive tree {parent_message.message_tree_id} (state='{ts.state.value}')."
+                f"Received message for inactive tree {parent_message.message_tree_id} (state='{ts.state.value}')."
             )
 
         if check_duplicate and not settings.DEBUG_ALLOW_DUPLICATE_TASKS:

backend/sql_snippets.md (+1 -1)

@@ -2,7 +2,7 @@
 
 Here are find some SQL queries to inspect the current OA postgres DB.
 
-# Baics Stats
+# Basic Stats
 
 ```sql
 -- tables row counts

data/datasets/bart_searchgpt_wiki_nlp_augment/README.md (+2 -2)

@@ -17,11 +17,11 @@ Related to [Issue #2004](https://github.com/LAION-AI/Open-Assistant/issues/2004)
 - OA format data (BART-based):
   https://huggingface.co/datasets/michaelthwan/oa_wiki_qa_bart_10000row
 
-### Syntheic data based on BART
+### Synthetic data based on BART
 
 ![wiki_augment_bart](./img/wiki_augment_bart.png)
 
-### Syntheic data based on SearchGPT
+### Synthetic data based on SearchGPT
 
 ![wiki_augment_searchgpt](./img/wiki_augment_searchgpt.png)

data/datasets/gutenberg/README.md (+2 -2)

@@ -33,7 +33,7 @@ size_categories:
 - 1K<n<10K
 ---
 
-# Dataset Card for Project Gutenber - Multilanguage eBooks
+# Dataset Card for Project Gutenberg - Multilanguage eBooks
 
 A collection of 7907 non-english (about 75-80% of all the ES, DE, FR, NL, IT,
 PT, HU books available on the site) and 48 285 english (80%+) language ebooks

@@ -103,7 +103,7 @@ metadata columns).
   https://www.gutenberg.org/help/copyright.html and
   https://www.gutenberg.org/policy/permission.html
 - Project Gutenberg has the following requests when using books without
-  metadata: _Books obtianed from the Project Gutenberg site should have the
+  metadata: _Books obtained from the Project Gutenberg site should have the
   following legal note next to them: "This eBook is for the use of anyone
   anywhere in the United States and most other parts of the world at no cost and
   with almost" no restrictions whatsoever. You may copy it, give it away or

data/datasets/oa_leet10k/README.md (+1 -1)

@@ -6,7 +6,7 @@ Here we convert oa_leet10k dataset to be uploaded to huggingface.
 
 Takes this Kaggle dataset 'leetcode-solutions'
 https://www.kaggle.com/datasets/erichartford/leetcode-solutions, and turns them
-into basic dialogue using a preset list of user prompt tempaltes.
+into basic dialogue using a preset list of user prompt templates.
 
 ### Some ideas for extending this dataset

data/datasets/oa_stackexchange/README.md (+2 -2)

@@ -39,9 +39,9 @@ This dataset is taken from https://archive.org/details/stackexchange.
 
 There's a single parquet file combining all stackexchange sites. The threads
 have been filtered as follows: only threads with an accepted answer, for which
-both the question and response is less than 1000 characters have been choosen.
+both the question and response is less than 1000 characters have been chosen.
 Other answers, or questions without accepted answers, or long entries have been
-droppped.
+dropped.
 
 Each row consists of

data/datasets/poetry_instruction/README.md (+3 -3)

@@ -1,7 +1,7 @@
 Dataset Description This dataset contains around 14,000 poems from the
-PoetryFoundation.org site. They are converted to question:response pairs,
-usingthe tags as topics. 5% of the dataset is titling requests -- the user
-provides apoem and asks the assistant to title it.
+PoetryFoundation.org site. They are converted to question:response pairs, using
+the tags as topics. 5% of the dataset is titling requests -- the user provides a
+poem and asks the assistant to title it.
 
 It can be found here, on my HuggingFace -
 https://huggingface.co/datasets/checkai/instruction-poems

data/datasets/recipes/README.md (+2 -2)

@@ -9,12 +9,12 @@ dataset to be uploaded to huggingface.
 Takes this Kaggle dataset 'Recipes from Tasty'
 https://www.kaggle.com/datasets/zeeenb/recipes-from-tasty?select=ingredient_and_instructions.json,
 filters for the top 1,000 highest rated recipes, and turns them into basic
-dialogue using a preset list of user prompt tempaltes.
+dialogue using a preset list of user prompt templates.
 
 ### Some ideas for extending this dataset
 
 This dataset is nicely structured, and the ingredients section includes the
 quantities and units separated out. Some, but not all already include a
 primary_unit (US) and metric_unit. We could find all recipes with both units and
 generate dialogue for the prompt 'convert the ingredients into metric', 'what
-are the ingredients in UK measurments'? etc..
+are the ingredients in UK measurements'? etc..

discord-bots/oa-bot-js/src/modules/open-assistant/langs.ts (+1 -1)

@@ -83,7 +83,7 @@ export const getLocaleDisplayName = (
   locale: string,
   displayLocale = undefined
 ) => {
-  // Intl defaults to English for locales that are not oficially translated
+  // Intl defaults to English for locales that are not officially translated
   if (missingDisplayNamesForLocales[locale]) {
     return missingDisplayNamesForLocales[locale];
   }

docker/grafana/README.md (+1 -1)

@@ -1,7 +1,7 @@
 # Grafana
 
 [Grafana](https://github.com/grafana/grafana) is used to visualize custom
-observabiltiy metrics and much more.
+observability metrics and much more.
 
 This folder contains various configuration files for Grafana.

docker/netdata/README.md (+1 -1)

@@ -2,7 +2,7 @@
 
 [Netdata](https://github.com/netdata/netdata) is an open source monitoring tool.
 
-This folder contains some configfuration files used to set up various netdata
+This folder contains some configuration files used to set up various netdata
 collectors we want to use like Redis, Postgres, etc.
 
 - [`./go.d/postgres.conf`](./go.d/postgres.conf) - Config for Netdata

docker/prometheus/README.md (+1 -1)

@@ -3,7 +3,7 @@
 [Prometheus](https://github.com/prometheus/prometheus) is an open source
 monitoring system.
 
-This folder contains some configfuration files used to set up Prometheus.
+This folder contains some configuration files used to set up Prometheus.
 
 - [`./prometheus.yml`](./prometheus.yml) - Config for Prometheus, including what
   `/metrics` endpoints to scrape.

docs/docs/tasks/label_prompter_reply.md (+1 -1)

@@ -1,7 +1,7 @@
 # Classifying an initial prompt or user reply
 
 In this task, you'll be shown a random message written by another person. This
-message is mimicing a request or question directed towards the assistant - a
+message is mimicking a request or question directed towards the assistant - a
 **prompt**. This prompt could either be a start of a conversation, or a reply to
 a message from the assistant. Your job is to rate parameters like quality or
 politeness, as well as include any applicable labels, such as spam, PII or

model/model_eval/manual/sampling_report.py (+2 -2)

@@ -243,9 +243,9 @@ def parse_args():
         "--prompts", type=str, help="jsonl string prompts input file name", default="./data/en_100_text.jsonl.gz"
     )
     parser.add_argument("--report", type=str, help="json sampling report output file name")
-    parser.add_argument("--seed", type=int, default="42", help="psoudo random number generator seed")
+    parser.add_argument("--seed", type=int, default="42", help="pseudo random number generator seed")
     parser.add_argument("--verbose", action="store_true", default=False)
-    parser.add_argument("-n", type=int, help="number of promtps to use (default: all)")
+    parser.add_argument("-n", type=int, help="number of prompts to use (default: all)")
     parser.add_argument("--num-samples", type=int, default=2, help="number of sampling runs per configuration")
    parser.add_argument("--config", type=str, default="config/default.json", help="configuration file path")
    parser.add_argument("--half", action="store_true", default=False, help="use float16")
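A side note on the `--seed` line in the hunk above: argparse applies the `type` callable to string-valued defaults, so `default="42"` with `type=int` still yields the integer 42 after parsing. A minimal standalone sketch (this parser is illustrative, not the project's full `parse_args`):

```python
import argparse

# argparse parses a string default as if it came from the command line,
# so default="42" combined with type=int produces the integer 42.
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default="42", help="pseudo random number generator seed")
parser.add_argument("-n", type=int, help="number of prompts to use (default: all)")
args = parser.parse_args([])

print(args.seed)  # 42, an int despite the string default
print(args.n)     # None, since no default was given
```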

model/model_training/configs/config_rl.yaml (+1 -1)

@@ -74,7 +74,7 @@ llama_rlhf:
   quantization: false
   seq2seqmodel: false
   freeze_layer: 52
-  num_layers_unfrozen: -1 # we dont use this, trlx has its own implementation
+  num_layers_unfrozen: -1 # we don't use this, trlx has its own implementation
   residual_dropout: 0.0
   use_flash_attention: true
   dtype: fp16

model/model_training/custom_datasets/qa_datasets.py (+1 -1)

@@ -335,7 +335,7 @@ def __init__(self, cache_dir) -> None:
                 data = json.loads(line)
                 joke = data["joke"]
                 # DO NOT change this
-                # its the data that had syntax error
+                # it's the data that had syntax error
                 explanation = data["explaination"]
                 self.pairs.append(create_dataset_entry_qa(mode="sft", questions=[joke], answers=[explanation]))

notebooks/data-augmentation/anthropic/README.md (+1 -1)

@@ -16,7 +16,7 @@ dataset = load_dataset("shahules786/prosocial_augmented")
 ## Steps
 
 1. Use prosocial dialog dataset to train a
-   [safety label classifer](https://huggingface.co/shahules786/prosocial-classifier).
+   [safety label classifier](https://huggingface.co/shahules786/prosocial-classifier).
 2. Finding Rules of thumbs(rots) present in prosocial dataset that matches
    task_description in red-teaming data.
 3. Use pretrained safety-classifier to predict safety labels for the selected

notebooks/data-augmentation/essay-instructions/README.md (+1 -1)

@@ -8,4 +8,4 @@ collecting for the model
 
 Feel free to contribute to this notebook, it's nowhere near perfect but it's a
 good start. If you want to contribute finding a new model that better suits this
-task would be great. Hugginface has a lot of models that could help.
+task would be great. Huggingface has a lot of models that could help.

notebooks/data-augmentation/wikidata-qa/README.md (+1 -1)

@@ -9,7 +9,7 @@ answer pair is necessary!
 
 A step-by-step guide:
 
-1. Create a WikiGraph crawler instance and define a cache file to avoide
+1. Create a WikiGraph crawler instance and define a cache file to avoid
    redownloading nodes (only English is supported at the moment)
 
 ```Python

notebooks/detoxify-evaluation/README.md (+6 -6)

@@ -43,13 +43,13 @@ Detoxify was tested on 4 different types of inputs
 
 ### Sentences used for testing and rating are contained inside the .ipynb
 
-| Model name | Not obviously toxic | Not obviously non-toxic | Obviously toxic | Obviously non-toxic |
-| :----------: | :--------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------: | :--------------------------------------------------------------: | :-----------------: |
-| original | failed at all, easily accepted racist, sexist overally toxic prompts that were well formulated | Very sensitive on swear words, failed to reckognize context | good performance | good performance |
-| unbiased | Managed to find some hidden toxicity but not on all sentences | Very sensitive explicit language but shown ability to recognize context | Did well but failed to reckognize some gender stereotype mockery | good performance |
-| multilingual | Managed to find some hidden toxicity but not on all sentences | Very sensitive explicit language but shown ability to recognize context | Did well but failed to reckognize some gender stereotype mockery | good performance |
+| Model name | Not obviously toxic | Not obviously non-toxic | Obviously toxic | Obviously non-toxic |
+| :----------: | :--------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------: | :-------------------------------------------------------------: | :-----------------: |
+| original | failed at all, easily accepted racist, sexist overally toxic prompts that were well formulated | Very sensitive on swear words, failed to recognize context | good performance | good performance |
+| unbiased | Managed to find some hidden toxicity but not on all sentences | Very sensitive explicit language but shown ability to recognize context | Did well but failed to recognize some gender stereotype mockery | good performance |
+| multilingual | Managed to find some hidden toxicity but not on all sentences | Very sensitive explicit language but shown ability to recognize context | Did well but failed to recognize some gender stereotype mockery | good performance |
 
-Subjectivly 'unbiased' looks like the best performing model.
+Subjectively 'unbiased' looks like the best performing model.
 
 I don't think it would do well as a security layer in a live version of open
 assistant unless we do some finetuning first, because it can be fooled to pass

notebooks/example/README.md (+2 -2)

@@ -3,7 +3,7 @@
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/example/example.ipynb)
 
 This folder contains an example reference notebook structure and approach for
-this project. Please try and follow this structure as closely as possible. While
+this project. Please try to follow this structure as closely as possible. While
 things will not exactly be the same for each notebook some principles we would
 like to try ensure are:
 
@@ -34,7 +34,7 @@ same directory that the notebook lives in so relative links etc should work as
 expected (for example `example.ipynb` will read some sample data from
 `data/data.csv`).
 
-If you are adding a notebook please try and add a similar cell to the top of the
+If you are adding a notebook please try to add a similar cell to the top of the
 notebook so that it is easy for others to run the notebook in colab. If your
 notebook does not have any dependencies beyond what already comes as standard in
 Google Colab then you do not need such a cell, just an "Open in Colab" badge

oasst-data/README.md (+1 -1)

@@ -145,7 +145,7 @@ messages are those which have a `review_result` that is `false`.
 
 Conversation threads are a linear lists of messages. THese objects can be
 identified by the presence of the `"thread_id"` property which contains the UUID
-of the last messsage of the the thread (which can be used to reconstruct the
+of the last message of the the thread (which can be used to reconstruct the
 thread by returning the list of ancestor messages up to the prompt root
 message). The message_id of the first message is normally also the id of the
 message-tree that contains the thread.
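The hunk above describes rebuilding a thread by walking ancestor messages from the `thread_id` message back to the prompt root. A minimal sketch of that walk, using a hypothetical in-memory parent map rather than the actual oasst-data objects:

```python
# Hypothetical message store: each message records its parent, and the
# root prompt has parent None. This dict is NOT the real oasst-data
# schema, just an illustration of the ancestor walk described above.
messages = {
    "root": {"text": "initial prompt", "parent": None},
    "a1": {"text": "assistant reply", "parent": "root"},
    "u2": {"text": "prompter follow-up", "parent": "a1"},
}

def reconstruct_thread(last_message_id):
    """Walk parent links from the last message up to the prompt root."""
    thread = []
    current = last_message_id
    while current is not None:
        thread.append(messages[current]["text"])
        current = messages[current]["parent"]
    thread.reverse()  # return the thread in conversation order, root first
    return thread

print(reconstruct_thread("u2"))
# ['initial prompt', 'assistant reply', 'prompter follow-up']
```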

oasst-shared/README.md (+1 -1)

@@ -1,3 +1,3 @@
-# Shared Python code for Open Assisstant
+# Shared Python code for Open Assistant
 
 Run `pip install -e .` to install the package in editable mode.

scripts/data-collection/twitter/README.md (+2 -2)

@@ -38,8 +38,8 @@ conversation, or at least as a prompt with replies.
   files for future processing. Note: Using polars instead of pandas due to
   performance reasons.
 - Wrote scripts that process the large dump of tweets into conversation threads
-  using the tree and node architecture. This results in aroun 17K conversation
-  threads bassed on a dump of 90M tweets.
+  using the tree and node architecture. This results in around 17K conversation
+  threads based on a dump of 90M tweets.
 - Script can output the conversation threads into a jsonl file for further
   filtering or use in models.
scripts/postprocessing/scoring.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def score_update_votes(new_vote: int, consensus: npt.ArrayLike, voter_data: Vote
7676
consensus_ranking = np.argsort(np.argsort(consensus))
7777
new_points = consensus_ranking[new_vote] + voter_data.voting_points
7878

79-
# we need to correct for 0 indexing, if you are closer to "right" than "wrong" of the conensus,
79+
# we need to correct for 0 indexing, if you are closer to "right" than "wrong" of the consensus,
8080
# it's a good vote
8181
new_good_votes = int(consensus_ranking[new_vote] > (len(consensus) - 1) / 2) + voter_data.num_good_votes
8282
new_num_votes = voter_data.num_votes + 1
@@ -105,7 +105,7 @@ def score_update_prompts(consensus: npt.ArrayLike, voter_data: Voter) -> Voter:
105105
delta_votes = np.sum(consensus_ranking * consensus / sum(consensus))
106106
new_points = delta_votes + voter_data.prompt_points
107107

108-
# we need to correct for 0 indexing, if you are closer to "right" than "wrong" of the conensus,
108+
# we need to correct for 0 indexing, if you are closer to "right" than "wrong" of the consensus,
109109
# it's a good vote
110110
new_good_prompts = int(delta_votes > 0) + voter_data.num_good_prompts
111111
new_num_prompts = voter_data.num_prompts + 1
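The double `argsort` in the hunks above converts raw consensus scores into 0-indexed ranks, and the `(len(consensus) - 1) / 2` comparison checks whether a vote landed in the upper half of that ranking. A small standalone illustration (the array values are made up):

```python
import numpy as np

# Double argsort maps each score to its 0-indexed rank: the smallest
# score gets rank 0, the largest gets rank len - 1.
consensus = np.array([0.1, 0.7, 0.2])
consensus_ranking = np.argsort(np.argsort(consensus))
print(consensus_ranking)  # [0 2 1]

# A vote counts as "good" when its rank is above the midpoint
# (len(consensus) - 1) / 2, mirroring score_update_votes above.
new_vote = 1  # index of the option the voter picked
is_good_vote = consensus_ranking[new_vote] > (len(consensus) - 1) / 2
print(is_good_vote)  # True: option 1 has the highest consensus rank
```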

website/src/components/Chat/ChatListItem.tsx (+1 -1)

@@ -75,7 +75,7 @@ export const ChatListItem = ({
 
   return (
     <Button
-      // @ts-expect-error error due to dynamicly changing as prop
+      // @ts-expect-error error due to dynamically changing as prop
       ref={rootRef}
       {...(!isEditing ? { as: Link, href: ROUTES.CHAT(chat.id) } : { as: "div" })}
       variant={isActive ? "solid" : "ghost"}

website/src/components/Messages/MessageTableEntry.tsx (+2 -2)

@@ -165,12 +165,12 @@ export const MessageTableEntry = forwardRef<HTMLDivElement, MessageTableEntryPro
         )}
         {message.deleted && isAdminOrMod && (
           <Badge colorScheme="red" textTransform="capitalize">
-            Deleted {/* dont translate, it's admin only feature */}
+            Deleted {/* don't translate, it's admin only feature */}
           </Badge>
         )}
         {message.review_result === false && isAdminOrMod && (
           <Badge colorScheme="yellow" textTransform="capitalize">
-            Spam {/* dont translate, it's admin only feature */}
+            Spam {/* don't translate, it's admin only feature */}
           </Badge>
         )}
       </Flex>

website/src/lib/oasst_api_client.ts (+1 -1)

@@ -420,7 +420,7 @@ export class OasstApiClient {
     return this.get<BackendUser>(`/api/v1/frontend_users/${user.auth_method}/${user.id}`);
   }
 
-  // TODO: add update-able fields eg: enbaled, notes, show_on_leaderboard, etc..
+  // TODO: add update-able fields eg: enabled, notes, show_on_leaderboard, etc..
   upsert_frontend_user(user: BackendUserCore) {
     // the backend does a upsert operation with this call
     return this.post<BackendUser>(`/api/v1/frontend_users/`, user);

website/src/lib/users.ts (+1 -1)

@@ -23,7 +23,7 @@ export const getBackendUserCore = async (id: string): Promise<BackendUserCore> =
 };
 
 /**
- * convert a user object to a canoncial representation used for interacting with the backend
+ * convert a user object to a canonical representation used for interacting with the backend
  * @param user frontend user object, from prisma db
 */
 export const convertToBackendUserCore = <T extends { accounts: Account[]; id: string; name: string }>(

website/src/pages/api/auth/[...nextauth].ts (+1 -1)

@@ -139,7 +139,7 @@ const authOptions: AuthOptions = {
     async signIn({ user, account, isNewUser }) {
       if (isNewUser && account.provider === "email" && !user.name) {
         // only generate a username if the user is new and they signed up with email and they don't have a name
-        // although the name already assigned in the jwt callback, this is to ensure notthing breaks, and we should never reach here.
+        // although the name already assigned in the jwt callback, this is to ensure nothing breaks, and we should never reach here.
         await prisma.user.update({
           data: {
             name: generateUsername(),
