
Commit

feat: Data insight (#444)
* huge update

* update unit test

* update

* update

* format

* special logic when length of dataset is 0

* edge case

* license
Dobiichi-Origami authored Apr 13, 2024
1 parent 994c96e commit 46117a6
Showing 16 changed files with 1,535 additions and 176 deletions.
1 change: 1 addition & 0 deletions cookbook/assets/essay.json

Large diffs are not rendered by default.

11 changes: 6 additions & 5 deletions python/pyproject.toml
@@ -38,6 +38,7 @@ pyarrow = [
{ version = ">=14.0.1", python = ">=3.8", optional = true },
{ version = "<=12.0.1", python = ">=3.7 <3.8", optional = true }
]
tabulate = { version = "*", optional = true }
python-dateutil = { version = "^2.8.2", optional = true }
rich = ">=13.0.0"
typer = ">=0.9.0"
@@ -86,11 +87,11 @@ papermill = {version = "*", python = ">=3.8 <4"}
ipykernel = {version = "^6.29.4", python = ">=3.8 <4"}

[tool.poetry.extras]
dataset_base = ["numpy", "pyarrow", "python-dateutil", "clevercsv", "ijson"]
langchain = ["numpy", "pyarrow", "python-dateutil", "clevercsv", "ijson", "langchain"]
local_data_clean = ["numpy", "pyarrow", "python-dateutil", "clevercsv", "ijson", "ltp", "emoji", "sentencepiece", "torch"]
openai = ["numpy", "pyarrow", "python-dateutil", "clevercsv", "ijson", "fastapi", "uvicorn"]
extension = ["numpy", "pyarrow", "python-dateutil", "clevercsv", "ijson", "langchain", "ltp", "emoji", "sentencepiece", "torch", "fastapi", "uvicorn"]
dataset_base = ["numpy", "pyarrow", "tabulate", "python-dateutil", "clevercsv", "ijson"]
langchain = ["numpy", "pyarrow", "tabulate", "python-dateutil", "clevercsv", "ijson", "langchain"]
local_data_clean = ["numpy", "pyarrow", "tabulate", "python-dateutil", "clevercsv", "ijson", "ltp", "emoji", "sentencepiece", "torch"]
openai = ["numpy", "pyarrow", "tabulate", "python-dateutil", "clevercsv", "ijson", "fastapi", "uvicorn"]
extension = ["numpy", "pyarrow", "tabulate", "python-dateutil", "clevercsv", "ijson", "langchain", "ltp", "emoji", "sentencepiece", "torch", "fastapi", "uvicorn"]

[tool.ruff.lint]
extend-select = ["PLW1514"]
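The extras change above adds tabulate to every optional dependency group, so it is pulled in whenever one of the extras is installed (e.g. pip install "qianfan[dataset_base]", assuming the distribution is published under the qianfan name). A minimal sketch of the lazy-import pattern such optional dependencies usually rely on, not necessarily the SDK's actual guard:

# Sketch only: optional extras such as tabulate are typically imported lazily
# so the base install keeps working without them.
try:
    from tabulate import tabulate  # available once an extras group is installed
except ImportError:
    tabulate = None


def render_rows(rows: list) -> str:
    if tabulate is None:
        raise RuntimeError(
            "tabulate is missing; install an extras group such as qianfan[dataset_base]"
        )
    # headers="keys" renders a list of dicts with the dict keys as column headers
    return tabulate(rows, headers="keys")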
13 changes: 13 additions & 0 deletions python/qianfan/dataset/data_insight/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) 2024 Baidu, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
80 changes: 80 additions & 0 deletions python/qianfan/dataset/data_insight/assets/index.html
@@ -0,0 +1,80 @@
<!--
Copyright (c) 2024 Baidu, Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Ant Design Table Example</title>
<!-- Load the Ant Design CSS file -->
<link rel="stylesheet" href="https://cdn.bootcdn.net/ajax/libs/antd/4.18.0/antd.min.css">
</head>
<body>
<!-- Container used to render the table -->
<div id="app"></div>

<!-- Load React, ReactDOM, Babel and the Ant Design JavaScript bundles -->
<script src="https://cdn.bootcdn.net/ajax/libs/react/17.0.2/umd/react.production.min.js"></script>
<script src="https://cdn.bootcdn.net/ajax/libs/react-dom/17.0.2/umd/react-dom.production.min.js"></script>
<script src="https://cdn.bootcdn.net/ajax/libs/babel-standalone/6.26.0/babel.min.js"></script>
<script src="https://cdn.bootcdn.net/ajax/libs/antd/4.18.0/antd.min.js"></script>

<!-- Define the table columns and data -->
<script type="text/babel">
const { Table } = antd;

const columns = [
{
title: 'title',
dataIndex: 'title',
key: 'title',
},
{
title: 'content',
dataIndex: 'content',
key: 'content',
},
];

const data = [
{
key: '1',
title: 'John Brown',
content: '阅读下面的材料,根据要求写作。\n2000年 农历庚辰龙年,人类迈进'
},
{
key: '2',
title: 'Jim Green',
content: 'London No. 1 Lake Park',
},
{
key: '3',
title: 'Joe Black',
content: 'Sidney No. 1 Lake Park',
},
];

// Render the table into the container
ReactDOM.render(
<Table columns={columns} dataSource={data} />,
document.getElementById('app')
);
</script>
</body>
</html>
23 changes: 23 additions & 0 deletions python/qianfan/dataset/data_insight/data_insight_utils.py
@@ -0,0 +1,23 @@
# Copyright (c) 2024 Baidu, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Utilities for data insight.
"""
from typing import Any, Callable, Dict, Generator, List


def _get_generator(iteration_list: List[Dict[str, Any]], func: Callable) -> Generator:
for single in iteration_list:
yield func(single)
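
_get_generator simply applies func lazily to each item of iteration_list; a tiny usage sketch (the length function below is made up for illustration):

from qianfan.dataset.data_insight.data_insight_utils import _get_generator

rows = [{"content": "hello"}, {"content": "world!"}]
gen = _get_generator(rows, lambda row: {"content_length": len(row["content"])})
print(list(gen))  # [{'content_length': 5}, {'content_length': 6}]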
251 changes: 251 additions & 0 deletions python/qianfan/dataset/data_insight/insight.py
@@ -0,0 +1,251 @@
# Copyright (c) 2024 Baidu, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""data insight"""

import math
from typing import Any, Callable, Dict, Generator, List, Optional, Set, Union

from pydantic import BaseModel, Field

from qianfan.dataset import Dataset
from qianfan.dataset.data_insight.data_insight_utils import (
_get_generator,
)
from qianfan.dataset.local_data_operators.consts import _default_special_characters_set
from qianfan.dataset.local_data_operators.utils import (
SentencePieceTokenizer,
get_augmentation_word_list,
get_words_from_document,
)
from qianfan.dataset.local_data_operators.word_list import _flagged_words


def get_content_length_for_each_entry(
entry: Union[List[Dict[str, Any]], Dict[str, Any], str],
column: Optional[str] = None,
**kwargs: Any,
) -> Union[Generator, Dict[str, Any]]:
length_column_name = "content_length"

if isinstance(entry, str):
return {length_column_name: len(entry)}

assert column is not None
if isinstance(entry, list):

def _generator(
entry: List[Dict[str, Any]]
) -> Generator[Dict[str, Any], None, None]:
for elem in entry:
yield {length_column_name: len(elem[column])}

return _generator(entry)

elif isinstance(entry, dict):
return {length_column_name: len(entry[column])}
else:
raise ValueError(f"unexpected entry type: {type(entry)}")
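
A minimal usage sketch for the operator above (sample rows invented for illustration): it accepts a bare string, a single row dict, or a list of rows, and reports the length under content_length.

from qianfan.dataset.data_insight.insight import get_content_length_for_each_entry

# Bare string: no column needed.
print(get_content_length_for_each_entry("hello"))  # {'content_length': 5}

# Single row: the target column must be named.
print(get_content_length_for_each_entry({"prompt": "hi"}, column="prompt"))  # {'content_length': 2}

# List of rows: a generator of per-row results comes back.
rows = [{"prompt": "a"}, {"prompt": "abc"}]
print(list(get_content_length_for_each_entry(rows, column="prompt")))
# [{'content_length': 1}, {'content_length': 3}]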


def get_character_repetition_ratio(
entry: Union[List[Dict[str, Any]], Dict[str, Any], str],
column: Optional[str] = None,
character_repetition_length: int = 10,
**kwargs: Any,
) -> Union[Generator, Dict[str, Any]]:
def _calculate_single(single: Union[Dict[str, Any], str]) -> Dict[str, Any]:
if isinstance(single, str):
document = single
else:
assert column
document = single[column]

def _get_freq_character_ngrams(content: str, n: int) -> Dict[str, int]:
character_ngrams = [content[i : i + n] for i in range(len(content) - n + 1)]
frequency_ngrams: Dict[str, int] = {}
for character_ngram in character_ngrams:
frequency_ngrams[character_ngram] = (
frequency_ngrams.get(character_ngram, 0) + 1
)
return frequency_ngrams

freq_character_ngrams = _get_freq_character_ngrams(
document, character_repetition_length
)

if len(freq_character_ngrams) == 0:
character_repetition_ratio = 0.0
else:
freq_character_ngram_values = list(freq_character_ngrams.values())
freq_character_ngram_values = sorted(
freq_character_ngram_values, reverse=True
)

val_one = len([el for el in freq_character_ngram_values if el == 1])
num_rep_character_ngrams = min(
int(math.sqrt(len(freq_character_ngram_values))),
len(freq_character_ngram_values) - val_one,
)
character_repetition_ratio = sum(
freq_character_ngram_values[:num_rep_character_ngrams]
) / sum(freq_character_ngram_values)

return {"character_repetition_ratio": character_repetition_ratio}

if isinstance(entry, list):
return _get_generator(entry, _calculate_single)
else:
return _calculate_single(entry)
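
A rough sketch of how the operator above behaves on toy strings (inputs invented for illustration): text shorter than character_repetition_length scores 0.0, fully repetitive text scores 1.0, and fully varied text also scores 0.0 because no 10-gram repeats.

from qianfan.dataset.data_insight.insight import get_character_repetition_ratio

print(get_character_repetition_ratio("abc"))      # shorter than the 10-gram window -> ratio 0.0
print(get_character_repetition_ratio("a" * 100))  # a single repeated 10-gram -> ratio 1.0

rows = [{"content": "the quick brown fox jumps over the lazy dog"}]
print(list(get_character_repetition_ratio(rows, column="content")))  # every 10-gram unique -> ratio 0.0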


def get_special_characters_ratio(
entry: Union[List[Dict[str, Any]], Dict[str, Any], str],
column: Optional[str] = None,
special_character_set: Set[str] = _default_special_characters_set,
**kwargs: Any,
) -> Union[Generator, Dict[str, Any]]:
def _calculate_single(single: Union[Dict[str, Any], str]) -> Dict[str, Any]:
if isinstance(single, str):
document = single
else:
assert column
document = single[column]

if len(document) == 0:
special_characters_ratio = 0.0
else:
special_characters_ratio = len(
[char for char in document if char in special_character_set]
) / len(document)

return {"special_characters_ratio": special_characters_ratio}

if isinstance(entry, list):
return _get_generator(entry, _calculate_single)
else:
return _calculate_single(entry)
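
A small sketch with an explicit character set (the default _default_special_characters_set is imported from the local data operators and not shown in this diff, so passing a set of our own keeps the example deterministic):

from qianfan.dataset.data_insight.insight import get_special_characters_ratio

marks = {",", "!", " "}

print(get_special_characters_ratio("hello, world!!", special_character_set=marks))
# 4 of the 14 characters are in the set -> {'special_characters_ratio': 0.2857...}

print(get_special_characters_ratio("", special_character_set=marks))
# empty documents short-circuit to {'special_characters_ratio': 0.0}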


def get_flagged_word_ratio(
entry: Union[List[Dict[str, Any]], Dict[str, Any], str],
sentence_piece_model_path: str,
column: Optional[str] = None,
words_augmentation_group_sizes: List[int] = [2],
stripped_charset: Set[str] = _default_special_characters_set,
flagged_set: Set[str] = set(_flagged_words.get("zh", [])),
**kwargs: Any,
) -> Union[Generator, Dict[str, Any]]:
sentence_piece_model = SentencePieceTokenizer(sentence_piece_model_path)

def _calculate_single(single: Union[Dict[str, Any], str]) -> Dict[str, Any]:
if isinstance(single, str):
document = single
else:
assert column
document = single[column]

words = get_words_from_document(
document,
"ZH",
sentence_piece_tokenizer=sentence_piece_model,
need_to_lower=False,
strip_characters=stripped_charset,
)

if not words:
flagged_words_ratio = 0.0
else:
augmentation: List[str] = []
if len(words_augmentation_group_sizes) > 0:
augmentation = get_augmentation_word_list(
words,
words_augmentation_group_sizes,
"",
)

flagged_words_ratio = len(
[word for word in words + augmentation if word in flagged_set]
) / len(words)

if flagged_words_ratio > 1.0:
flagged_words_ratio = 1.0

return {"flagged_words_ratio": flagged_words_ratio}

if isinstance(entry, list):
return _get_generator(entry, _calculate_single)
else:
return _calculate_single(entry)
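
The flagged-word operator needs a SentencePiece model on disk; the path below is a placeholder rather than a file shipped with the SDK, so treat this as an illustration of the call shape only:

from qianfan.dataset.data_insight.insight import get_flagged_word_ratio

# Placeholder path: point this at a real SentencePiece model before running.
model_path = "/path/to/sentencepiece.model"

# A short Chinese sample, since the operator tokenizes with the hard-coded "ZH" setting.
rows = [{"content": "一段用于演示的中文文本"}]
results = get_flagged_word_ratio(
    rows,
    sentence_piece_model_path=model_path,
    column="content",
)
print(list(results))  # one {'flagged_words_ratio': ...} dict per row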


class DatasetInsight(BaseModel):
operator_list: List[
Callable[
[Union[List[Dict[str, Any]], Dict[str, Any], str]],
Union[Generator, Dict[str, Any]],
]
] = Field(
default=[
get_content_length_for_each_entry,
get_character_repetition_ratio,
get_special_characters_ratio,
]
)

def insight(
self, ds: Dataset, column: Optional[str] = None, **kwargs: Any
) -> Dataset:
def _closure(
entry: Union[List[Dict[str, Any]], Dict[str, Any], str]
) -> Union[Generator, Dict[str, Any]]:
result_list: List = [
operator(entry, column=column, **kwargs) # type: ignore
for operator in self.operator_list
]

# Compatibility handling: the operators may return generators here (when the entry is a list)
if isinstance(result_list[0], Generator):
# A generator function is returned here instead of yielding directly,
# so that the Python interpreter does not treat the whole enclosing function as a generator

assert isinstance(entry, list)
assert column is not None

def _generator() -> Generator:
index = 0
while True:
return_dict: Dict[str, Any] = {}
for generator in result_list:
return_dict.update(next(generator))

return_dict[column] = entry[index][column]
index += 1

yield return_dict

return _generator()
else:
return_dict: Dict[str, Any] = {}
for result in result_list:
return_dict.update(result)

column_name = column if column else "content"
column_value = (
entry[column] if column and isinstance(entry, dict) else entry
)
return_dict[column_name] = column_value

return return_dict

return ds.map(_closure, should_create_new_obj=True)
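
An end-to-end sketch of DatasetInsight (the Dataset.create_from_pyobj constructor and list() accessor are assumed from the rest of the SDK; substitute however your datasets are actually built):

from qianfan.dataset import Dataset
from qianfan.dataset.data_insight.insight import DatasetInsight

# Assumption: create_from_pyobj builds an in-memory dataset from a list of dicts.
ds = Dataset.create_from_pyobj(
    [
        {"content": "a short answer"},
        {"content": "a much longer answer with far more characters in it"},
    ]
)

# Runs the default operators (content length, character repetition ratio,
# special characters ratio) and returns a new Dataset of insight rows.
insight_ds = DatasetInsight().insight(ds, column="content")
print(insight_ds.list())  # assumption: list() materializes the rows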

