|
@@ -74,7 +74,11 @@ from modelscope.msdatasets import MsDataset
|
|
|
def get_gsm8k_questions(split="train") -> Dataset:
|
|
|
# data = load_dataset('openai/gsm8k', 'main')[split]
|
|
|
data = MsDataset.load('openai-mirror/gsm8k', subset_name='main', split=split)
|
|
|
- print("original datasets for train ->\n",data)
|
|
|
+ # Save original datasets to JSONL
|
|
|
+ with open(f'gsm8k_original_{split}.jsonl', 'w') as f:
|
|
|
+ for item in data:
|
|
|
+ f.write(json.dumps(item) + '\n')
|
|
|
+
|
|
|
data = data.map(lambda x: {
|
|
|
'prompt': [
|
|
|
{'role': 'system', 'content': SYSTEM_PROMPT},
|
|
@@ -82,7 +86,11 @@ def get_gsm8k_questions(split="train") -> Dataset:
|
|
|
],
|
|
|
'answer': extract_hash_answer(x['answer'])
|
|
|
})
|
|
|
- print("format datasets for train ->\n",data)
|
|
|
+
|
|
|
+ # Save formatted datasets to JSONL
|
|
|
+ with open(f'gsm8k_formatted_{split}.jsonl', 'w') as f:
|
|
|
+ for item in data:
|
|
|
+ f.write(json.dumps(item) + '\n')
|
|
|
return data
|
|
|
|
|
|
# Get dataset
|