# Source code for secretflow.stats.table_statistics
# Copyright 2022 Ant Group Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union
import pandas as pd
from secretflow.data.vertical import VDataFrame
def table_statistics(table: Union[pd.DataFrame, VDataFrame]) -> pd.DataFrame:
    """Compute per-column summary statistics for a table.

    The result has one row per column of ``table`` and one column per
    statistic.

    Args:
        table: the pd.DataFrame or VDataFrame to summarise.

    Returns:
        A pd.DataFrame indexed by ``table.columns`` whose columns are, in order:
        datatype, total_count, count, count_na, min, max, mean, var, std, sem,
        skew, kurtosis, q1, q2, q3, moment_2, moment_3, moment_4,
        central_moment_2, central_moment_3, central_moment_4, sum, sum_2,
        sum_3 and sum_4, where

        * moment_k is E[X^k],
        * central_moment_k is E[(X - mean(X))^k],
        * sum_k is sum(X^k).

    Raises:
        AssertionError: if ``table`` is neither a pd.DataFrame nor a VDataFrame.
    """
    assert isinstance(
        table, (pd.DataFrame, VDataFrame)
    ), "table must be a pd.DataFrame or VDataFrame"

    stats = pd.DataFrame(index=table.columns)

    # Basic shape / completeness information.
    stats['datatype'] = table.dtypes
    stats['total_count'] = table.shape[0]
    stats['count'] = table.count()
    stats['count_na'] = table.isna().sum()

    # Reductions that share the same call shape; the tuple order fixes the
    # order of the output columns.
    for reducer in ('min', 'max', 'mean', 'var', 'std', 'sem', 'skew', 'kurtosis'):
        stats[reducer] = getattr(table, reducer)(numeric_only=True)

    # Quartiles.
    for label, prob in (('q1', 0.25), ('q2', 0.5), ('q3', 0.75)):
        stats[label] = table.quantile(prob)

    orders = (2, 3, 4)

    # Raw moments E[X^k], computed on the columns kept by
    # select_dtypes('number').
    for k in orders:
        powered = table.select_dtypes('number').pow(k)
        stats[f'moment_{k}'] = powered.mean(numeric_only=True)

    # Central moments E[(X - mean)^k]; the mean column computed above is
    # subtracted before the dtype filter, as in the raw-moment case otherwise.
    for k in orders:
        centered = table.subtract(stats['mean'])
        powered = centered.select_dtypes('number').pow(k)
        stats[f'central_moment_{k}'] = powered.mean(numeric_only=True)

    # Plain sum followed by power sums sum(X^k).
    stats['sum'] = table.sum(numeric_only=True)
    for k in orders:
        powered = table.select_dtypes('number').pow(k)
        stats[f'sum_{k}'] = powered.sum(numeric_only=True)

    return stats