Data Statistic¶

This component will do some statistical work on the data, including statistical mean, maximum and minimum, median, etc.

The indicators for each column that can be statistic are list as follow.

count: Number of data
sum: The sum of this column
mean: The mean of this column
variance/stddev: Variance and standard deviation of this column
median: Median of this column
min/max: Min and Max value of this column
coefficient of variance: The formula is abs(stddev / mean)
missing_count/missing_ratio: Number and ratio of missing value in this column
skewness: The definition can be referred to here
kurtosis: The definition can be referred to here
percentile: The value of percentile. Accept 0% to 100% while the number before the "%" should be integer.

These static values can be used in feature selection as a criterion.

Param¶

`statistics_param` ¶

Classes¶

`StatisticsParam (BaseParam)` ¶

Define statistics params

Parameters¶

list, string, default "summary"

Specify the statistic types to be computed. "summary" represents list: [consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION, consts.MEDIAN, consts.MIN, consts.MAX, consts.MISSING_COUNT, consts.SKEWNESS, consts.KURTOSIS]

list of string, default []

Specify columns to be used for statistic computation by column names in header

list of int, default -1

Specify columns to be used for statistic computation by column order in header -1 indicates to compute statistics over all columns

bool, default: True

If False, the calculations of skewness and kurtosis are corrected for statistical bias.

bool, default True

Indicate whether to run this modules

Source code in federatedml/param/statistics_param.py

class StatisticsParam(BaseParam):
    """
    Define statistics params

    Parameters
    ----------
    statistics: list, string, default "summary"
        Specify the statistic types to be computed.
        "summary" represents list: [consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION,
                    consts.MEDIAN, consts.MIN, consts.MAX,
                    consts.MISSING_COUNT, consts.SKEWNESS, consts.KURTOSIS]

    column_names: list of string, default []
        Specify columns to be used for statistic computation by column names in header

    column_indexes: list of int, default -1
        Specify columns to be used for statistic computation by column order in header
        -1 indicates to compute statistics over all columns

    bias: bool, default: True
        If False, the calculations of skewness and kurtosis are corrected for statistical bias.

    need_run: bool, default True
        Indicate whether to run this modules
    """

    LEGAL_STAT = [consts.COUNT, consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION,
                  consts.MEDIAN, consts.MIN, consts.MAX, consts.VARIANCE,
                  consts.COEFFICIENT_OF_VARIATION, consts.MISSING_COUNT,
                  consts.MISSING_RATIO,
                  consts.SKEWNESS, consts.KURTOSIS]
    BASIC_STAT = [consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION,
                  consts.MEDIAN, consts.MIN, consts.MAX, consts.MISSING_RATIO,
                  consts.MISSING_COUNT, consts.SKEWNESS, consts.KURTOSIS,
                  consts.COEFFICIENT_OF_VARIATION]
    LEGAL_QUANTILE = re.compile("^(100)|([1-9]?[0-9])%$")

    def __init__(self, statistics="summary", column_names=None,
                 column_indexes=-1, need_run=True, abnormal_list=None,
                 quantile_error=consts.DEFAULT_RELATIVE_ERROR, bias=True):
        super().__init__()
        self.statistics = statistics
        self.column_names = column_names
        self.column_indexes = column_indexes
        self.abnormal_list = abnormal_list
        self.need_run = need_run
        self.quantile_error = quantile_error
        self.bias = bias
        if column_names is None:
            self.column_names = []
        if column_indexes is None:
            self.column_indexes = []
        if abnormal_list is None:
            self.abnormal_list = []

    # @staticmethod
    # def extend_statistics(statistic_name):
    #     basic_metrics = [consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION,
    #                 consts.MEDIAN, consts.MIN, consts.MAX, consts.MISSING_RATIO,
    #                 consts.MISSING_COUNT, consts.SKEWNESS, consts.KURTOSIS,
    #                 consts.COEFFICIENT_OF_VARIATION]
    #     if statistic_name == "summary":
    #         return basic_metrics
    #
    # if statistic_name == "describe":
    #     return [consts.COUNT, consts.MEAN, consts.STANDARD_DEVIATION,
    #             consts.MIN, consts.MAX]

    @staticmethod
    def find_stat_name_match(stat_name):
        if stat_name in StatisticsParam.LEGAL_STAT or StatisticsParam.LEGAL_QUANTILE.match(stat_name):
            return True
        return False

        # match_result = [legal_name == stat_name for legal_name in StatisticsParam.LEGAL_STAT]
        # match_result.append(0 if LEGAL_QUANTILE.match(stat_name) is None else True)
        # match_found = sum(match_result) > 0
        # return match_found

    def check(self):
        model_param_descr = "Statistics's param statistics"
        BaseParam.check_boolean(self.need_run, model_param_descr)
        statistics = copy.copy(self.BASIC_STAT)
        if not isinstance(self.statistics, list):
            if self.statistics in [consts.SUMMARY]:
                self.statistics = statistics
            else:
                if self.statistics not in statistics:
                    statistics.append(self.statistics)
                self.statistics = statistics
        else:
            for s in self.statistics:
                if s not in statistics:
                    statistics.append(s)
            self.statistics = statistics

        for stat_name in self.statistics:
            match_found = StatisticsParam.find_stat_name_match(stat_name)
            if not match_found:
                raise ValueError(f"Illegal statistics name provided: {stat_name}.")

        model_param_descr = "Statistics's param column_names"
        if not isinstance(self.column_names, list):
            raise ValueError(f"column_names should be list of string.")
        for col_name in self.column_names:
            BaseParam.check_string(col_name, model_param_descr)

        model_param_descr = "Statistics's param column_indexes"
        if not isinstance(self.column_indexes, list) and self.column_indexes != -1:
            raise ValueError(f"column_indexes should be list of int or -1.")
        if self.column_indexes != -1:
            for col_index in self.column_indexes:
                if not isinstance(col_index, int):
                    raise ValueError(f"{model_param_descr} should be int or list of int")
                if col_index < -consts.FLOAT_ZERO:
                    raise ValueError(f"{model_param_descr} should be non-negative int value(s)")

        if not isinstance(self.abnormal_list, list):
            raise ValueError(f"abnormal_list should be list of int or string.")

        self.check_decimal_float(self.quantile_error, "Statistics's param quantile_error ")
        self.check_boolean(self.bias, "Statistics's param bias ")
        return True

BASIC_STAT ¶

LEGAL_QUANTILE ¶

LEGAL_STAT ¶

__init__(self, statistics='summary', column_names=None, column_indexes=-1, need_run=True, abnormal_list=None, quantile_error=0.0001, bias=True)

special ¶

Source code in federatedml/param/statistics_param.py

def __init__(self, statistics="summary", column_names=None,
             column_indexes=-1, need_run=True, abnormal_list=None,
             quantile_error=consts.DEFAULT_RELATIVE_ERROR, bias=True):
    super().__init__()
    self.statistics = statistics
    self.column_names = column_names
    self.column_indexes = column_indexes
    self.abnormal_list = abnormal_list
    self.need_run = need_run
    self.quantile_error = quantile_error
    self.bias = bias
    if column_names is None:
        self.column_names = []
    if column_indexes is None:
        self.column_indexes = []
    if abnormal_list is None:
        self.abnormal_list = []

find_stat_name_match(stat_name) staticmethod ¶

Source code in federatedml/param/statistics_param.py

@staticmethod
def find_stat_name_match(stat_name):
    if stat_name in StatisticsParam.LEGAL_STAT or StatisticsParam.LEGAL_QUANTILE.match(stat_name):
        return True
    return False

check(self) ¶

Source code in federatedml/param/statistics_param.py

def check(self):
    model_param_descr = "Statistics's param statistics"
    BaseParam.check_boolean(self.need_run, model_param_descr)
    statistics = copy.copy(self.BASIC_STAT)
    if not isinstance(self.statistics, list):
        if self.statistics in [consts.SUMMARY]:
            self.statistics = statistics
        else:
            if self.statistics not in statistics:
                statistics.append(self.statistics)
            self.statistics = statistics
    else:
        for s in self.statistics:
            if s not in statistics:
                statistics.append(s)
        self.statistics = statistics

    for stat_name in self.statistics:
        match_found = StatisticsParam.find_stat_name_match(stat_name)
        if not match_found:
            raise ValueError(f"Illegal statistics name provided: {stat_name}.")

    model_param_descr = "Statistics's param column_names"
    if not isinstance(self.column_names, list):
        raise ValueError(f"column_names should be list of string.")
    for col_name in self.column_names:
        BaseParam.check_string(col_name, model_param_descr)

    model_param_descr = "Statistics's param column_indexes"
    if not isinstance(self.column_indexes, list) and self.column_indexes != -1:
        raise ValueError(f"column_indexes should be list of int or -1.")
    if self.column_indexes != -1:
        for col_index in self.column_indexes:
            if not isinstance(col_index, int):
                raise ValueError(f"{model_param_descr} should be int or list of int")
            if col_index < -consts.FLOAT_ZERO:
                raise ValueError(f"{model_param_descr} should be non-negative int value(s)")

    if not isinstance(self.abnormal_list, list):
        raise ValueError(f"abnormal_list should be list of int or string.")

    self.check_decimal_float(self.quantile_error, "Statistics's param quantile_error ")
    self.check_boolean(self.bias, "Statistics's param bias ")
    return True

Examples¶

Example

Pipeline

pipeline-data-statistics-partial-column-missing.py

import argparse
import json

from pipeline.backend.pipeline import PipeLine
from pipeline.component import DataTransform
from pipeline.component.evaluation import Evaluation
from pipeline.component.intersection import Intersection
from pipeline.component.reader import Reader
from pipeline.interface.data import Data
from pipeline.interface.model import Model
from pipeline.component.data_statistics import DataStatistics
from pipeline.utils.tools import load_job_config


def prettify(response, verbose=True):
    if verbose:
        print(json.dumps(response, indent=4, ensure_ascii=False))
        print()
    return response


def main(config="../../config.yaml", namespace=""):
    if isinstance(config, str):
        config = load_job_config(config)
    parties = config.parties
    guest = parties.guest[0]
    hosts = parties.host[0]

    guest_train_data = {"name": "ionosphere_scale_hetero_guest", "namespace": f"experiment{namespace}"}
    host_train_data = {"name": "ionosphere_scale_hetero_host", "namespace": f"experiment{namespace}"}
    # guest_train_data = {"name": "default_credit_hetero_guest", "namespace": f"experiment{namespace}"}
    # host_train_data = {"name": "default_credit_hetero_host", "namespace": f"experiment{namespace}"}

    # initialize pipeline
    pipeline = PipeLine()
    # set job initiator
    pipeline.set_initiator(role='guest', party_id=guest)
    # set participants information
    pipeline.set_roles(guest=guest, host=hosts)

    # define Reader components to read in data
    reader_0 = Reader(name="reader_0")
    # configure Reader for guest
    reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=guest_train_data)
    # configure Reader for host
    reader_0.get_party_instance(role='host', party_id=hosts).component_param(table=host_train_data)

    data_transform_0 = DataTransform(name="data_transform_0", output_format='dense', missing_fill=False)

    # get DataTransform party instance of guest
    data_transform_0_guest_party_instance = data_transform_0.get_party_instance(role='guest', party_id=guest)
    # configure DataTransform for guest
    data_transform_0_guest_party_instance.component_param(with_label=True, label_name="label")
    # get and configure DataTransform party instance of host
    data_transform_0.get_party_instance(role='host', party_id=hosts).component_param(with_label=False)

    # define Intersection components
    intersection_0 = Intersection(name="intersection_0")

    pipeline.add_component(reader_0)

    pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data))

    pipeline.add_component(intersection_0, data=Data(data=data_transform_0.output.data))

    statistic_param = {
        "name": "statistic_0",
        "statistics": ["95%", "coefficient_of_variance", "stddev"],
        "column_indexes": [1, 2],
        "column_names": ["x3"]
    }
    statistic_0 = DataStatistics(**statistic_param)
    pipeline.add_component(statistic_0, data=Data(data=intersection_0.output.data))

    pipeline.compile()

    # fit model
    pipeline.fit()
    # query component summary
    prettify(pipeline.get_component("statistic_0").get_summary())


if __name__ == "__main__":
    parser = argparse.ArgumentParser("PIPELINE DEMO")
    parser.add_argument("-config", type=str,
                        help="config file")
    args = parser.parse_args()
    if args.config is not None:
        main(args.config)
    else:
        main()

pipeline-data-statistics-all-columns.py

import argparse
import json

from pipeline.backend.pipeline import PipeLine
from pipeline.component import DataTransform
from pipeline.component import Intersection
from pipeline.component import Reader
from pipeline.interface import Data
from pipeline.interface import Model
from pipeline.component import DataStatistics
from pipeline.utils.tools import load_job_config


def prettify(response, verbose=True):
    if verbose:
        print(json.dumps(response, indent=4, ensure_ascii=False))
        print()
    return response


def main(config="../../config.yaml", namespace=""):
    if isinstance(config, str):
        config = load_job_config(config)
    parties = config.parties
    guest = parties.guest[0]
    hosts = parties.host[0]

    guest_train_data = {"name": "breast_hetero_guest", "namespace": f"experiment{namespace}"}
    host_train_data = {"name": "breast_hetero_host", "namespace": f"experiment{namespace}"}
    # guest_train_data = {"name": "default_credit_hetero_guest", "namespace": f"experiment{namespace}"}
    # host_train_data = {"name": "default_credit_hetero_host", "namespace": f"experiment{namespace}"}

    # initialize pipeline
    pipeline = PipeLine()
    # set job initiator
    pipeline.set_initiator(role='guest', party_id=guest)
    # set participants information
    pipeline.set_roles(guest=guest, host=hosts)

    # define Reader components to read in data
    reader_0 = Reader(name="reader_0")
    # configure Reader for guest
    reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=guest_train_data)
    # configure Reader for host
    reader_0.get_party_instance(role='host', party_id=hosts).component_param(table=host_train_data)

    data_transform_0 = DataTransform(name="data_transform_0", output_format='dense')

    # get DataTransform party instance of guest
    data_transform_0_guest_party_instance = data_transform_0.get_party_instance(role='guest', party_id=guest)
    # configure DataTransform for guest
    data_transform_0_guest_party_instance.component_param(with_label=True)
    # get and configure DataTransform party instance of host
    data_transform_0.get_party_instance(role='host', party_id=hosts).component_param(with_label=False)

    # define Intersection components
    intersection_0 = Intersection(name="intersection_0")

    pipeline.add_component(reader_0)

    pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data))

    pipeline.add_component(intersection_0, data=Data(data=data_transform_0.output.data))

    statistic_param = {
        "name": "statistic_0",
        "statistics": ["95%", "coefficient_of_variance", "stddev"],
        "column_indexes": -1,
        "column_names": []
    }
    statistic_0 = DataStatistics(**statistic_param)
    pipeline.add_component(statistic_0, data=Data(data=intersection_0.output.data))

    pipeline.compile()

    # fit model
    pipeline.fit()
    # query component summary
    prettify(pipeline.get_component("statistic_0").get_summary())


if __name__ == "__main__":
    parser = argparse.ArgumentParser("PIPELINE DEMO")
    parser.add_argument("-config", type=str,
                        help="config file")
    args = parser.parse_args()
    if args.config is not None:
        main(args.config)
    else:
        main()

pipeline-data-statistics-partial-column-name.py

import argparse
import json

from pipeline.backend.pipeline import PipeLine
from pipeline.component import DataTransform
from pipeline.component.evaluation import Evaluation
from pipeline.component.intersection import Intersection
from pipeline.component.reader import Reader
from pipeline.interface.data import Data
from pipeline.interface.model import Model
from pipeline.component.data_statistics import DataStatistics
from pipeline.utils.tools import load_job_config


def prettify(response, verbose=True):
    if verbose:
        print(json.dumps(response, indent=4, ensure_ascii=False))
        print()
    return response


def main(config="../../config.yaml", namespace=""):
    if isinstance(config, str):
        config = load_job_config(config)
    parties = config.parties
    guest = parties.guest[0]
    hosts = parties.host[0]

    guest_train_data = {"name": "breast_hetero_guest", "namespace": f"experiment{namespace}"}
    host_train_data = {"name": "breast_hetero_host", "namespace": f"experiment{namespace}"}
    # guest_train_data = {"name": "default_credit_hetero_guest", "namespace": f"experiment{namespace}"}
    # host_train_data = {"name": "default_credit_hetero_host", "namespace": f"experiment{namespace}"}

    # initialize pipeline
    pipeline = PipeLine()
    # set job initiator
    pipeline.set_initiator(role='guest', party_id=guest)
    # set participants information
    pipeline.set_roles(guest=guest, host=hosts)

    # define Reader components to read in data
    reader_0 = Reader(name="reader_0")
    # configure Reader for guest
    reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=guest_train_data)
    # configure Reader for host
    reader_0.get_party_instance(role='host', party_id=hosts).component_param(table=host_train_data)

    data_transform_0 = DataTransform(name="data_transform_0", output_format='dense')

    # get DataTransform party instance of guest
    data_transform_0_guest_party_instance = data_transform_0.get_party_instance(role='guest', party_id=guest)
    # configure DataTransform for guest
    data_transform_0_guest_party_instance.component_param(with_label=True)
    # get and configure DataTransform party instance of host
    data_transform_0.get_party_instance(role='host', party_id=hosts).component_param(with_label=False)

    # define Intersection components
    intersection_0 = Intersection(name="intersection_0")

    pipeline.add_component(reader_0)

    pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data))

    pipeline.add_component(intersection_0, data=Data(data=data_transform_0.output.data))

    statistic_param = {
        "name": "statistic_0",
        "statistics": ["95%", "coefficient_of_variance", "stddev"],
        "column_indexes": [1, 2],
        "column_names": ["x3"]
    }
    statistic_0 = DataStatistics(**statistic_param)
    pipeline.add_component(statistic_0, data=Data(data=intersection_0.output.data))

    pipeline.compile()

    # fit model
    pipeline.fit()
    # query component summary
    prettify(pipeline.get_component("statistic_0").get_summary())


if __name__ == "__main__":
    parser = argparse.ArgumentParser("PIPELINE DEMO")
    parser.add_argument("-config", type=str,
                        help="config file")
    args = parser.parse_args()
    if args.config is not None:
        main(args.config)
    else:
        main()

init.py

data_statistics_pipeline_testsuite.json

{
    "data": [
        {
            "file": "examples/data/breast_hetero_guest.csv",
            "head": 1,
            "partition": 16,
            "table_name": "breast_hetero_guest",
            "namespace": "experiment",
            "role": "guest_0"
        },
        {
            "file": "examples/data/breast_hetero_host.csv",
            "head": 1,
            "partition": 16,
            "table_name": "breast_hetero_host",
            "namespace": "experiment",
            "role": "host_0"
        },
        {
            "file": "examples/data/breast_hetero_host.csv",
            "head": 1,
            "partition": 16,
            "table_name": "breast_hetero_host",
            "namespace": "experiment",
            "role": "host_1"
        },
        {
            "file": "examples/data/ionosphere_scale_hetero_guest.csv",
            "head": 1,
            "partition": 16,
            "table_name": "ionosphere_scale_hetero_guest",
            "namespace": "experiment",
            "role": "guest_0"
        },
        {
            "file": "examples/data/ionosphere_scale_hetero_host.csv",
            "head": 1,
            "partition": 16,
            "table_name": "ionosphere_scale_hetero_host",
            "namespace": "experiment",
            "role": "host_0"
        }
    ],
    "pipeline_tasks": {
        "data-statistics-all-columns": {
            "script": "pipeline-data-statistics-all-columns.py"
        },
        "data-statistics-partial-column": {
            "script": "pipeline-data-statistics-partial-column.py"
        },
        "data-statistics-partial-column-name": {
            "script": "pipeline-data-statistics-partial-column-name.py"
        },
        "data-statistics-partial-column-missing": {
            "script": "pipeline-data-statistics-partial-column-missing.py"
        }
    }
}

pipeline-data-statistics-partial-column.py

import argparse
import json

from pipeline.backend.pipeline import PipeLine
from pipeline.component import DataTransform
from pipeline.component import Intersection
from pipeline.component import Reader
from pipeline.interface import Data
from pipeline.interface import Model
from pipeline.component import DataStatistics
from pipeline.utils.tools import load_job_config


def prettify(response, verbose=True):
    if verbose:
        print(json.dumps(response, indent=4, ensure_ascii=False))
        print()
    return response


def main(config="../../config.yaml", namespace=""):
    if isinstance(config, str):
        config = load_job_config(config)
    parties = config.parties
    guest = parties.guest[0]
    hosts = parties.host[0]

    guest_train_data = {"name": "breast_hetero_guest", "namespace": f"experiment{namespace}"}
    host_train_data = {"name": "breast_hetero_host", "namespace": f"experiment{namespace}"}
    # guest_train_data = {"name": "default_credit_hetero_guest", "namespace": f"experiment{namespace}"}
    # host_train_data = {"name": "default_credit_hetero_host", "namespace": f"experiment{namespace}"}

    # initialize pipeline
    pipeline = PipeLine()
    # set job initiator
    pipeline.set_initiator(role='guest', party_id=guest)
    # set participants information
    pipeline.set_roles(guest=guest, host=hosts)

    # define Reader components to read in data
    reader_0 = Reader(name="reader_0")
    # configure Reader for guest
    reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=guest_train_data)
    # configure Reader for host
    reader_0.get_party_instance(role='host', party_id=hosts).component_param(table=host_train_data)

    data_transform_0 = DataTransform(name="data_transform_0", output_format='dense')

    # get DataTransform party instance of guest
    data_transform_0_guest_party_instance = data_transform_0.get_party_instance(role='guest', party_id=guest)
    # configure DataTransform for guest
    data_transform_0_guest_party_instance.component_param(with_label=True)
    # get and configure DataTransform party instance of host
    data_transform_0.get_party_instance(role='host', party_id=hosts).component_param(with_label=False)

    # define Intersection components
    intersection_0 = Intersection(name="intersection_0")

    pipeline.add_component(reader_0)

    pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data))

    pipeline.add_component(intersection_0, data=Data(data=data_transform_0.output.data))

    statistic_param = {
        "name": "statistic_0",
        "statistics": ["95%", "coefficient_of_variance", "stddev"],
        "column_indexes": [1, 2],
        "column_names": []
    }
    statistic_0 = DataStatistics(**statistic_param)
    pipeline.add_component(statistic_0, data=Data(data=intersection_0.output.data))

    pipeline.compile()

    # fit model
    pipeline.fit()
    # query component summary
    prettify(pipeline.get_component("statistic_0").get_summary())


if __name__ == "__main__":
    parser = argparse.ArgumentParser("PIPELINE DEMO")
    parser.add_argument("-config", type=str,
                        help="config file")
    args = parser.parse_args()
    if args.config is not None:
        main(args.config)
    else:
        main()

DSL

data_statistics_partial_column_dsl.json

{
    "components": {
        "reader_0": {
            "module": "Reader",
            "output": {
                "data": [
                    "data"
                ]
            }
        },
        "data_transform_0": {
            "module": "DataTransform",
            "input": {
                "data": {
                    "data": [
                        "reader_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        },
        "intersection_0": {
            "module": "Intersection",
            "input": {
                "data": {
                    "data": [
                        "data_transform_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ]
            }
        },
        "statistic_0": {
            "module": "DataStatistics",
            "input": {
                "data": {
                    "data": [
                        "intersection_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        }
    }
}

data_statistics_testsuite.json

{
    "data": [
        {
            "file": "examples/data/breast_hetero_guest.csv",
            "head": 1,
            "partition": 16,
            "table_name": "breast_hetero_guest",
            "namespace": "experiment",
            "role": "guest_0"
        },
        {
            "file": "examples/data/breast_hetero_host.csv",
            "head": 1,
            "partition": 16,
            "table_name": "breast_hetero_host",
            "namespace": "experiment",
            "role": "host_0"
        },
        {
            "file": "examples/data/breast_hetero_host.csv",
            "head": 1,
            "partition": 16,
            "table_name": "breast_hetero_host",
            "namespace": "experiment",
            "role": "host_1"
        },
        {
            "file": "examples/data/ionosphere_scale_hetero_guest.csv",
            "head": 1,
            "partition": 16,
            "table_name": "ionosphere_scale_hetero_guest",
            "namespace": "experiment",
            "role": "guest_0"
        },
        {
            "file": "examples/data/ionosphere_scale_hetero_host.csv",
            "head": 1,
            "partition": 16,
            "table_name": "ionosphere_scale_hetero_host",
            "namespace": "experiment",
            "role": "host_0"
        }
    ],
    "tasks": {
        "data_statistics_all_columns": {
            "conf": "data_statistics_all_columns_conf.json",
            "dsl": "data_statistics_all_columns_dsl.json"
        },
        "data_statistics_partial_column": {
            "conf": "data_statistics_partial_column_conf.json",
            "dsl": "data_statistics_partial_column_dsl.json"
        },
        "data_statistics_partial_column_name": {
            "conf": "data_statistics_partial_column_name_conf.json",
            "dsl": "data_statistics_partial_column_name_dsl.json"
        },
        "data_statistics_partial_column_missing": {
            "conf": "data_statistics_partial_column_missing_conf.json",
            "dsl": "data_statistics_partial_column_missing_dsl.json"
        }
    }
}

data_statistics_partial_column_conf.json

{
    "dsl_version": 2,
    "initiator": {
        "role": "guest",
        "party_id": 9999
    },
    "role": {
        "host": [
            10000
        ],
        "guest": [
            9999
        ]
    },
    "component_parameters": {
        "role": {
            "host": {
                "0": {
                    "data_transform_0": {
                        "with_label": false
                    },
                    "reader_0": {
                        "table": {
                            "name": "breast_hetero_host",
                            "namespace": "experiment"
                        }
                    }
                }
            },
            "guest": {
                "0": {
                    "data_transform_0": {
                        "with_label": true
                    },
                    "reader_0": {
                        "table": {
                            "name": "breast_hetero_guest",
                            "namespace": "experiment"
                        }
                    }
                }
            }
        },
        "common": {
            "data_transform_0": {
                "output_format": "dense"
            },
            "statistic_0": {
                "statistics": [
                    "95%",
                    "coefficient_of_variance",
                    "stddev"
                ],
                "column_names": [],
                "column_indexes": [
                    1,
                    2
                ]
            }
        }
    }
}

data_statistics_partial_column_missing_conf.json

{
    "dsl_version": 2,
    "initiator": {
        "role": "guest",
        "party_id": 9999
    },
    "role": {
        "host": [
            10000
        ],
        "guest": [
            9999
        ]
    },
    "component_parameters": {
        "role": {
            "guest": {
                "0": {
                    "data_transform_0": {
                        "with_label": true,
                        "label_name": "label"
                    },
                    "reader_0": {
                        "table": {
                            "name": "ionosphere_scale_hetero_guest",
                            "namespace": "experiment"
                        }
                    }
                }
            },
            "host": {
                "0": {
                    "data_transform_0": {
                        "with_label": false
                    },
                    "reader_0": {
                        "table": {
                            "name": "ionosphere_scale_hetero_host",
                            "namespace": "experiment"
                        }
                    }
                }
            }
        },
        "common": {
            "data_transform_0": {
                "output_format": "dense",
                "missing_fill": false
            },
            "statistic_0": {
                "statistics": [
                    "95%",
                    "coefficient_of_variance",
                    "stddev"
                ],
                "column_names": [
                    "x3"
                ],
                "column_indexes": [
                    1,
                    2
                ]
            }
        }
    }
}

data_statistics_all_columns_conf.json

{
    "dsl_version": 2,
    "initiator": {
        "role": "guest",
        "party_id": 9999
    },
    "role": {
        "host": [
            10000
        ],
        "guest": [
            9999
        ]
    },
    "component_parameters": {
        "role": {
            "host": {
                "0": {
                    "reader_0": {
                        "table": {
                            "name": "breast_hetero_host",
                            "namespace": "experiment"
                        }
                    },
                    "data_transform_0": {
                        "with_label": false
                    }
                }
            },
            "guest": {
                "0": {
                    "reader_0": {
                        "table": {
                            "name": "breast_hetero_guest",
                            "namespace": "experiment"
                        }
                    },
                    "data_transform_0": {
                        "with_label": true
                    }
                }
            }
        },
        "common": {
            "data_transform_0": {
                "output_format": "dense"
            },
            "statistic_0": {
                "statistics": [
                    "95%",
                    "coefficient_of_variance",
                    "stddev"
                ],
                "column_names": [],
                "column_indexes": -1
            }
        }
    }
}

data_statistics_partial_column_name_conf.json

{
    "dsl_version": 2,
    "initiator": {
        "role": "guest",
        "party_id": 9999
    },
    "role": {
        "host": [
            10000
        ],
        "guest": [
            9999
        ]
    },
    "component_parameters": {
        "role": {
            "guest": {
                "0": {
                    "data_transform_0": {
                        "with_label": true
                    },
                    "reader_0": {
                        "table": {
                            "name": "breast_hetero_guest",
                            "namespace": "experiment"
                        }
                    }
                }
            },
            "host": {
                "0": {
                    "data_transform_0": {
                        "with_label": false
                    },
                    "reader_0": {
                        "table": {
                            "name": "breast_hetero_host",
                            "namespace": "experiment"
                        }
                    }
                }
            }
        },
        "common": {
            "data_transform_0": {
                "output_format": "dense"
            },
            "statistic_0": {
                "statistics": [
                    "95%",
                    "coefficient_of_variance",
                    "stddev"
                ],
                "column_names": [
                    "x3"
                ],
                "column_indexes": [
                    1,
                    2
                ]
            }
        }
    }
}

data_statistics_partial_column_name_dsl.json

{
    "components": {
        "reader_0": {
            "module": "Reader",
            "output": {
                "data": [
                    "data"
                ]
            }
        },
        "data_transform_0": {
            "module": "DataTransform",
            "input": {
                "data": {
                    "data": [
                        "reader_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        },
        "intersection_0": {
            "module": "Intersection",
            "input": {
                "data": {
                    "data": [
                        "data_transform_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ]
            }
        },
        "statistic_0": {
            "module": "DataStatistics",
            "input": {
                "data": {
                    "data": [
                        "intersection_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        }
    }
}

data_statistics_all_columns_dsl.json

{
    "components": {
        "reader_0": {
            "module": "Reader",
            "output": {
                "data": [
                    "data"
                ]
            }
        },
        "data_transform_0": {
            "module": "DataTransform",
            "input": {
                "data": {
                    "data": [
                        "reader_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        },
        "intersection_0": {
            "module": "Intersection",
            "input": {
                "data": {
                    "data": [
                        "data_transform_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ]
            }
        },
        "statistic_0": {
            "module": "DataStatistics",
            "input": {
                "data": {
                    "data": [
                        "intersection_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        }
    }
}

data_statistics_partial_column_missing_dsl.json

{
    "components": {
        "reader_0": {
            "module": "Reader",
            "output": {
                "data": [
                    "data"
                ]
            }
        },
        "data_transform_0": {
            "module": "DataTransform",
            "input": {
                "data": {
                    "data": [
                        "reader_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        },
        "intersection_0": {
            "module": "Intersection",
            "input": {
                "data": {
                    "data": [
                        "data_transform_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ]
            }
        },
        "statistic_0": {
            "module": "DataStatistics",
            "input": {
                "data": {
                    "data": [
                        "intersection_0.data"
                    ]
                }
            },
            "output": {
                "data": [
                    "data"
                ],
                "model": [
                    "model"
                ]
            }
        }
    }
}

Last update: 2021-11-08

Data Statistic¶

Param¶

statistics_param ¶

Classes¶

StatisticsParam (BaseParam) ¶

Parameters¶

Examples¶

`statistics_param` ¶

`StatisticsParam (BaseParam)` ¶