-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathp1.py
140 lines (122 loc) · 5.11 KB
/
p1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import unittest
from io import StringIO
import sys
import pandas as pd
# Show every column when a DataFrame is printed (None removes the column limit).
pd.set_option('display.max_columns', None)
def print_type_structure(obj, indent=0):
    """Print a nested outline of the container types that make up ``obj``.

    Lists and tuples are expanded element by element, dictionaries key by
    key, and every other object is reported by its type name only.

    Args:
        obj: Object to describe.
        indent: Number of leading spaces for the current nesting level.
    """
    pad = ' ' * indent
    if isinstance(obj, (list, tuple)):
        print(pad + 'List/Tuple:')
        for item in obj:
            print_type_structure(item, indent + 4)
    elif isinstance(obj, dict):
        print(pad + 'Dictionary:')
        for key, value in obj.items():
            # Format the key inside the f-string: joining it with "+" raised
            # TypeError for any key that is not a str (ints, tuples, ...).
            print(' ' * (indent + 4) + f'Key: {type(key).__name__} # ->"{key}", Value:')
            print_type_structure(value, indent + 8)
    else:
        print(pad + f'Type: {type(obj).__name__}')
def train():
    """Load the "emotion" dataset, explore it with pandas and tokenize it.

    Prints the dataset's features, structure and size, converts the training
    split to a DataFrame, adds a readable ``label_name`` column, draws a bar
    chart of the label counts and finally runs the module-level ``tokenizer``
    over the "text" column of every split. No model is trained.

    Takes no arguments and returns nothing.
    """
    from datasets import load_dataset
    import matplotlib.pyplot as plt

    def _tokenize_text(batch):
        # map() hands over a dict of columns; only "text" is tokenized.
        return tokenizer(batch["text"], padding=True, truncation=True)

    # Load the emotion dataset.
    emotions = load_dataset("emotion")
    print(emotions["train"].features)
    print(type(emotions))
    print_type_structure(emotions)
    # Print the number of rows in the training split.
    print(len(emotions["train"]))

    # Switch the output format to pandas so slices come back as DataFrames.
    emotions.set_format("pandas")
    # With the pandas format active, slicing the whole split yields a DataFrame.
    df = emotions["train"][:]
    print("emotions-train-type:" + str(type(emotions["train"])))
    print("df-type:" + str(type(df)))
    print(df.head())
    # Print the first five rows of the test split.
    print(emotions["test"][0:5])
    print(type(emotions["train"].features["label"]))

    # Convert the integer labels into readable names. apply() returns a new
    # Series, which is stored in a new column.
    df["label_name"] = df["label"].apply(
        lambda x: emotions["train"].features["label"].int2str(x)
    )
    # Count the labels and draw their distribution as a bar chart.
    df["label_name"].value_counts().plot(kind="bar")
    print(df.head())

    # Drop the pandas format first so map() passes plain column dicts, then
    # tokenize the "text" column. The previous code passed the tokenizer itself
    # to map() and then called tokenizer() with no arguments, which raises.
    emotions.reset_format()
    emotions.map(_tokenize_text, batched=True, batch_size=None)

    # Title for the label distribution chart.
    plt.title("FIVE EMOTIONS")
    # plt.show()  # uncomment to display the chart
def testDataFrameFeture():
    """Describe a small two-column DataFrame with ``print_type_structure``."""
    columns = {'A': [1, 2, 3], 'B': [4, 5, 6]}
    frame = pd.DataFrame(columns)
    print_type_structure(frame)
from datasets import Dataset
import numpy as np
def generate_random_data(num_samples, num_features):
    """Build a list of random samples with binary labels.

    Args:
        num_samples: Number of samples to generate.
        num_features: Number of random feature values per sample.

    Returns:
        A list of dicts, each with a 'data' list of ``num_features`` random
        values from ``np.random.rand`` and a 'label' drawn from {0, 1}.
    """
    # Draw the features first and the labels second.
    feature_matrix = np.random.rand(num_samples, num_features)
    targets = np.random.randint(0, 2, size=num_samples)

    # Pair each feature row with its label.
    records = []
    for features, target in zip(feature_matrix, targets):
        records.append({'data': list(features), 'label': target})
    return records
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
# Module-level DistilBERT tokenizer, created once when the module is loaded and
# used by train(), tokenize() and train2() in this file.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
def tokenize(batch):
    """Tokenize the first five "text" entries of the "train" split in ``batch``.

    Args:
        batch: Mapping with a "train" entry that can be sliced and then
            indexed by "text" (this file passes the emotion dataset dict).

    Returns:
        The result of calling the module-level ``tokenizer`` on those texts
        with padding and truncation enabled.
    """
    first_rows = batch["train"][:5]
    texts = first_rows["text"]
    # Show the container/element types of what is about to be tokenized.
    print_type_structure(texts)
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded
def train2():
    """Print what the module-level tokenizer does with a mixed-language sentence.

    Shows, in order: the tokens joined back into a string, the token list,
    the vocabulary size, the model's maximum length, the encoded ids (twice),
    the decoded round trip and the full tokenizer call output.
    """
    text = "hello everyone 我是一个中国人"

    tokens = tokenizer.tokenize(text)
    rebuilt = tokenizer.convert_tokens_to_string(tokens)
    print(rebuilt)
    print(tokens)

    print(tokenizer.vocab_size)
    print(tokenizer.model_max_length)

    # Encode and print the ids two times in a row.
    for _ in range(2):
        print(tokenizer.encode(text))

    round_trip = tokenizer.decode(tokenizer.encode(text))
    print(round_trip)

    full_encoding = tokenizer(text, padding=True, truncation=True)
    print(full_encoding)
def train3():
    """Load the emotion dataset, add label columns and tokenize a sample.

    Prints the first five training rows before and after adding the
    ``label_name`` (readable class name) and ``label_str`` (label as text)
    columns, then restores the default dataset format and prints the result
    of ``tokenize`` on the dataset.
    """
    from datasets import load_dataset

    # Load the emotion dataset and view the training split as a DataFrame.
    emotions = load_dataset("emotion")
    emotions.set_format("pandas")
    train_frame = emotions["train"][:]
    print(train_frame.head(5))

    label_feature = emotions["train"].features["label"]

    def _to_name(label_id):
        # Translate an integer class id into its string name.
        return label_feature.int2str(label_id)

    train_frame["label_name"] = train_frame["label"].apply(_to_name)
    train_frame["label_str"] = train_frame["label"].apply(str)
    print(train_frame.head(5))

    # Return to the default format before handing the dataset to tokenize().
    emotions.reset_format()
    encoded = tokenize(emotions)
    print(encoded)
# def tokenize(batch):
# pass
# # return tokenizer.
if __name__ == '__main__':
    # Build two random 3-feature samples and wrap them in a Dataset.
    samples = generate_random_data(2, 3)
    dataset = Dataset.from_list(samples)
    print(type(dataset))
    print(samples)

    # Run the DataFrame structure demo, then the emotion-dataset walkthrough.
    testDataFrameFeture()
    train3()