# Copyright 2023 The OpenAssistant Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This template serves as a starting point for contributing a dataset to the OpenAssistant repo.
When modifying it for your dataset, look for TODO items that offer specific instructions.
To create a dataset loading script you will create a class and implement 3 methods:
* `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object.
* `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associates local data with each split.
* `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`.
Full documentation on writing dataset loading scripts can be found here:
https://huggingface.co/docs/datasets/dataset_script
This template is adapted from the one provided by BigScience's BigBIO library:
https://github.com/bigscience-workshop/biomedical/blob/main/templates/template.py
TODO: Before submitting your script, delete this docstring and replace it with a description of your dataset.
"""
import json
import os
from typing import Dict, List, Tuple

import datasets

from .hub import OpenAssistantConfig

# TODO: import the schema (i.e. features) that fits your dataset;
# `features` below is a placeholder name for that schema object:
from .hub import features

# TODO: Add BibTeX citation where appropriate
_CITATION = """\
@article{,
  author = {},
  title = {},
  journal = {},
  volume = {},
  year = {},
  url = {},
  doi = {},
  biburl = {},
  bibsource = {}
}
"""
# TODO: create a module level variable with your dataset name (should match the script name)
# E.g. The Pile: [dataset_name] --> the_pile
_DATASETNAME = "[dataset_name]"
# TODO: create a pretty display name for your dataset
_DISPLAYNAME = "Dataset Name"
# TODO: Add a description of the dataset here
# You can copy an official description
_DESCRIPTION = """\
This dataset is designed for the XXX NLP task.
"""
# TODO: Add a link to an official homepage for the dataset here (if possible)
_HOMEPAGE = ""
# TODO: Add the license for the dataset here (if possible).
# Note that this doesn't have to be a common open source license;
# some datasets have custom licenses. In that case, simply put the full
# license terms into `_LICENSE`.
_LICENSE = ""
# TODO: Add links to the URLs needed to download your dataset files.
# This variable can be a relative path for datasets whose files need to be
# manually downloaded or preprocessed in advance.
# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators.
# However, if you need to access different files for each config you can have multiple entries in this dict.
# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method)
_URLS = {
_DATASETNAME: "url or list of urls or relative path like ./data ",
}
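# For example (hypothetical URLs, shown only to illustrate the nested form):
# _URLS = {
#     _DATASETNAME: {
#         "train": "https://example.com/train.jsonl",
#         "test": "https://example.com/test.jsonl",
#     },
# }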
# TODO: Add the tasks supported by the dataset. One dataset may support multiple tasks.
_SUPPORTED_TASKS = [] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]
# TODO: Set this to a version that is associated with the dataset; if none exists, use "1.0.0".
# This version doesn't have to be consistent with semantic versioning. Anything that is
# provided by the original dataset as a version goes.
_VERSION = ""
# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case
# Append "Dataset" to the class name: ThePile --> ThePileDataset
class NewDataset(datasets.GeneratorBasedBuilder):
    """TODO: Short description of my dataset."""

    VERSION = datasets.Version(_VERSION)

    # You will be able to load each dataset with
    # dataset = datasets.load_dataset('my_dataset')
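    # or, since this script defines one config per schema, a specific config
    # (the config name here is illustrative and must match BUILDER_CONFIGS below):
    # dataset = datasets.load_dataset('my_dataset', name=f"{_DATASETNAME}_[schema_name]")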
    # TODO: For each dataset, implement a config for each subset;
    # if a dataset contains more than one subset, implement a config for EACH of them.
    # Each of them should contain:
    #   - name: should be unique for each dataset config, e.g. the_pile_[schema_name]
    #   - version: VERSION
    #   - description: a one-line description of the dataset
    #   - schema: open_assistant_[schema_name]
    #   - subset_id: the canonical name for the dataset (e.g. the_pile)
    # where [schema_name] = (language_modeling)
    BUILDER_CONFIGS = [
        OpenAssistantConfig(
            name=f"{_DATASETNAME}_[schema_name]",
            version=VERSION,
            description=f"OpenAssistant dataset config for {_DATASETNAME}",
            schema_name="[schema_name]",
            subset_id=_DATASETNAME,
        )
    ]
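
    # Note: for `datasets.load_dataset` to resolve a config when none is given,
    # DEFAULT_CONFIG_NAME should match the `name` of one of the BUILDER_CONFIGS
    # above (here that would be f"{_DATASETNAME}_[schema_name]").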
    DEFAULT_CONFIG_NAME = _DATASETNAME

    def _info(self) -> datasets.DatasetInfo:
        # TODO: Implement the schema for your dataset here, assign it to
        # `features`, and remove the `raise` below.
        raise NotImplementedError()
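        # A minimal sketch of what `features` might look like (the field names
        # are placeholders; the real open_assistant schemas live in .hub):
        #
        #   features = datasets.Features(
        #       {
        #           "text": datasets.Value("string"),
        #           "meta": {"source": datasets.Value("string")},
        #       }
        #   )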
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        # TODO: This method is tasked with downloading/extracting the data and
        # defining the splits depending on the configuration.
        # If you need to access a config choice, it is available as self.config.name.
        # dl_manager is a datasets.download.DownloadManager that can be used to
        # download and extract URLs; many examples use the download_and_extract
        # method. See the DownloadManager docs here:
        # https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager
        # dl_manager accepts any type of nested list/dict and gives back the same
        # structure with each URL replaced by the path to the local file.
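        # For example (hypothetical URLs, as a sketch of that round-trip):
        #
        #   paths = dl_manager.download_and_extract(
        #       {"train": ["https://example.com/a", "https://example.com/b"]}
        #   )
        #   paths["train"][0]  # local path of the first train file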
        urls = _URLS[_DATASETNAME]
        data_dir = dl_manager.download_and_extract(urls)

        # Not all datasets have predefined canonical train/val/test splits.
        # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data.
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # Whatever you put in gen_kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "train.jsonl"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "test.jsonl"),
                    "split": "test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "dev.jsonl"),
                    "split": "dev",
                },
            ),
        ]

    # Method parameters are unpacked from `gen_kwargs` as given in `_split_generators`.
    # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs.
    def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]:
        """Yields examples as (key, example) tuples."""
        # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
        # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example.
        # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files.
        if self.config.schema_name == "[schema_name]":
            # TODO: yield (key, example) tuples in the given schema.
            # A minimal sketch for the JSON-lines files referenced above, assuming
            # one JSON object per line; adapt this to your data format.
            with open(filepath, encoding="utf-8") as f:
                for key, line in enumerate(f):
                    example = json.loads(line)
                    yield key, example

# This allows you to run your dataloader with `python [dataset_name].py` during development.
# TODO: Remove this before making your PR.
if __name__ == "__main__":
    datasets.load_dataset(__file__)