Metrics and checks#

Note

There are two main parts related to handling of metrics and checks in Dataiku’s Python APIs:

- dataiku.core.metrics.ComputedMetrics in the dataiku package. It was initially designed for usage within DSS.
- dataikuapi.dss.metrics.ComputedMetrics in the dataikuapi package. It was initially designed for usage outside of DSS.

Both classes have fairly similar capabilities.

For more details on the two packages, please see Concepts and examples.

Add metric on a column#

def add_metrics_probes_col_stats(probes, aggregation, column):
    """
    Add a column-statistics metric to a list of probes.

    If a probe of type ``col_stats`` already exists, the requested
    (aggregation, column) pair is appended to its aggregates unless it is
    already listed. Otherwise a new ``col_stats`` probe holding that single
    aggregate is appended to ``probes``.

    :param probes: the list of existing probes (modified in place)
    :param aggregation: which aggregation is used (e.g. ``'MIN'``)
    :param column: the dataset column to compute the aggregation on

    Usage example:

    .. code-block:: python

        settings = dataset.get_settings()
        metrics = settings.get_raw()['metrics']
        add_metrics_probes_col_stats(metrics['probes'], 'MIN', 'purchase_amount')

    """

    # Look up the probe itself rather than its index: an index of 0 is falsy,
    # so a truthiness test on the index would miss a col_stats probe that
    # happens to be the first entry and wrongly add a duplicate probe.
    col_stats_probe = next((probe for probe in probes if probe["type"] == 'col_stats'), None)
    new_aggregate = {'aggregated': aggregation, 'column': column}

    if col_stats_probe is not None:
        aggregates = col_stats_probe['configuration']['aggregates']
        if new_aggregate not in aggregates:
            aggregates.append(new_aggregate)
    else:
        probes.append({'computeOnBuildMode': 'NO',
                       'configuration': {'aggregates': [new_aggregate]},
                       'enabled': True,
                       'meta': {'level': 2, 'name': 'Columns statistics'},
                       'type': 'col_stats'})

# Example usage: register a MIN aggregation on the 'purchase_amount' column.
# `dataset` must already exist; how it is obtained is not shown in this snippet.
# NOTE(review): the raw settings are only modified in memory here; presumably
# they still have to be saved on `settings` to be persisted — confirm.
settings = dataset.get_settings()
metrics = settings.get_raw()['metrics']
add_metrics_probes_col_stats(metrics['probes'], 'MIN', 'purchase_amount')

Make a defined metric visible#

def add_displayed_state_to_metrics(displayed_state, type_to_add, function_to_add, column=""):
    """
    Register a metric in the displayed state, unless it is already listed.

    The metric identifier is built as ``type:function`` and, when a column is
    given, ``type:function:column``.

    :param displayed_state: the current displayed state (its ``'metrics'`` list is modified in place)
    :param type_to_add: which kind of metric (e.g. ``'col_stats'``)
    :param function_to_add: the function that has been used (e.g. ``'MIN'``)
    :param column: the column the metric applies to, if any

    Usage example:

    .. code-block:: python

        settings = dataset.get_settings()
        metrics = settings.get_raw()['metrics']
        add_displayed_state_to_metrics(metrics['displayedState'], 'col_stats', 'MIN', 'purchase_amount')

    """

    id_parts = [type_to_add, function_to_add]
    if column:
        id_parts.append(column)
    metric_id = ':'.join(id_parts)

    shown_metrics = displayed_state['metrics']
    if metric_id in shown_metrics:
        return
    shown_metrics.append(metric_id)

# Example usage: make the 'col_stats:MIN:purchase_amount' metric part of the displayed metrics.
# `dataset` must already exist; how it is obtained is not shown in this snippet.
# NOTE(review): the raw settings are only modified in memory here; presumably
# they still have to be saved on `settings` to be persisted — confirm.
settings = dataset.get_settings()
metrics = settings.get_raw()['metrics']
add_displayed_state_to_metrics(metrics['displayedState'], 'col_stats', 'MIN', 'purchase_amount')

Define a new numerical check#

def add_metrics_checks_numeric_range(checks, label, which, parameters):
    """
    Add a numeric-range check on a metric, only if none exists for it yet.

    Nothing is added when ``checks['checks']`` already contains a check of
    type ``numericRange`` on the same metric id. Otherwise a new check is
    appended with every bound disabled, except the ones listed in
    ``parameters``, which are set to the given value and enabled.

    :param checks: the existing checks (its ``'checks'`` list is modified in place)
    :param label: label for the check
    :param which: id of the metric the check applies to
    :param parameters: sequence of (bound name, value) pairs, e.g. ``[('minimum', 100)]``

    Usage example:

    .. code-block:: python

        settings = dataset.get_settings()
        checks = settings.get_raw()['metricsChecks']
        CHECK_RECORDS_NAME = 'Number of records should be greater than 100'
        add_metrics_checks_numeric_range(checks, CHECK_RECORDS_NAME, 'records:COUNT_RECORDS',
                                         [('minimum', 100)])

    """

    # Bail out as soon as an equivalent check is found.
    for existing in checks['checks']:
        if existing['type'] == 'numericRange' and existing['metricId'] == which:
            return

    range_check = {
        'computeOnBuildMode': 'PARTITION',
        'meta': {'label': label, 'name': 'Value in range'},
        'metricId': which,
    }
    # Every bound starts at 0.0 and disabled.
    for bound in ('maximum', 'minimum', 'softMaximum', 'softMinimum'):
        range_check[bound] = 0.0
        range_check[bound + 'Enabled'] = False
    range_check['type'] = 'numericRange'

    # Requested bounds override the defaults and get switched on.
    for parameter in parameters:
        bound_name, bound_value = parameter[0], parameter[1]
        range_check[bound_name] = bound_value
        range_check[bound_name + 'Enabled'] = True

    checks['checks'].append(range_check)

# Example usage: require the record count of `dataset` to be at least 100.
# `dataset` must already exist; how it is obtained is not shown in this snippet.
settings = dataset.get_settings()
checks = settings.get_raw()['metricsChecks']
CHECK_RECORDS_NAME = 'Number of records should be greater than 100'
add_metrics_checks_numeric_range(checks, CHECK_RECORDS_NAME, 'records:COUNT_RECORDS',
                                 [('minimum', 100)])

Make a defined check visible#

def set_check_visible(checks, label):
    """
    Add a defined check to the displayed state (so the user can see it in the GUI).

    The label is appended to ``checks['displayedState']['checks']`` unless it
    is already listed there.

    :param checks: the metricsChecks part of the dataset settings (modified in place)
    :param label: label of the check to display
    :return: None

    Usage example:

    .. code-block:: python

        settings = dataset.get_settings()
        CHECK_RECORDS_NAME = 'Number of records should be greater than 100'
        checks = settings.get_raw()['metricsChecks']
        set_check_visible(checks, CHECK_RECORDS_NAME)

    """

    visible_labels = checks['displayedState']['checks']
    if label in visible_labels:
        return
    visible_labels.append(label)

# Example usage: list the record-count check among the displayed checks.
# `dataset` must already exist; how it is obtained is not shown in this snippet.
# NOTE(review): the raw settings are only modified in memory here; presumably
# they still have to be saved on `settings` to be persisted — confirm.
settings = dataset.get_settings()
CHECK_RECORDS_NAME = 'Number of records should be greater than 100'
checks = settings.get_raw()['metricsChecks']
set_check_visible(checks, CHECK_RECORDS_NAME)

Retrieve metric results#

def get_metrics(dataset):
    """
    Compute the metrics of a dataset and return the ids of the displayed ones.

    Metrics are computed first, then the last metric values are read and only
    the entries flagged ``displayedAsMetric`` are kept.

    :param dataset: the dataset
    :return: the list of ids of the metrics displayed for this dataset

    Usage example:

    .. code-block:: python

        result = {}
        last_metrics = dataset.get_last_metric_values()
        metrics = get_metrics(dataset)
        for metric in metrics:
            metric_value = last_metrics.get_metric_by_id(metric)
            if metric_value and metric_value['lastValues']:
                result[metric] = {
                    'initialValue': metric_value['lastValues'][0]['value']
                }
    """
    dataset.compute_metrics()
    raw_values = dataset.get_last_metric_values().get_raw()
    return [entry['metric']['id']
            for entry in raw_values['metrics']
            if entry['displayedAsMetric']]

# Example usage: for every displayed metric of `dataset`, record the value of
# the first entry of its 'lastValues'.
# `dataset` must already exist; how it is obtained is not shown in this snippet.
result = {}

# NOTE(review): this handle is fetched before get_metrics() calls
# compute_metrics(), so it may hold the values from before that computation —
# confirm this ordering is intended.
last_metrics = dataset.get_last_metric_values()
metrics = get_metrics(dataset)
for metric in metrics:
    metric_value = last_metrics.get_metric_by_id(metric)
    # Skip metrics that were not found or that have no recorded value.
    if metric_value and metric_value['lastValues']:
        result[metric] = {
            'initialValue': metric_value['lastValues'][0]['value']
        }
        
print(result)

Retrieve check results#

def get_checks_used(settings):
    """
    List the ids of all the checks displayed for a dataset.

    Each label found in ``settings['metricsChecks']['displayedState']['checks']``
    is prefixed with ``'check:CHECK:'``.

    :param settings: the raw settings (dict) of the dataset
    :return: the list of ids of all checks used for this dataset
    """
    displayed_labels = settings['metricsChecks']['displayedState']['checks']
    return ['check:CHECK:' + check_label for check_label in displayed_labels]

def get_checks(dataset):
    """
    Compute the metrics, run the checks, and return the ids of the displayed checks.

    :param dataset: the dataset
    :return: the list of ids of the checks displayed for this dataset,
             as built by ``get_checks_used``

    Usage example:

    .. code-block:: python

        result = {}
        last_metrics = dataset.get_last_metric_values()
        checks = get_checks(dataset)
        for check in checks:
            check_value = last_metrics.get_metric_by_id(check)
            if check_value and check_value['lastValues']:
                result[check] = {
                    'initialValue': check_value['lastValues'][0]['value']
                }
    """
    # Metrics are computed before the checks are run.
    dataset.compute_metrics()
    dataset.run_checks()
    # Hand back a fresh list, as before.
    return list(get_checks_used(dataset.get_settings().get_raw()))

Reference documentation#

Classes#

`dataiku.core.metrics.ComputedMetrics`(raw)	Handle to the metrics of a DSS object and their last computed value
`dataiku.core.metrics.MetricDataPoint`(raw)	A value of a metric, on a partition
`dataiku.core.metrics.ComputedChecks`(raw)	Handle to the checks of a DSS object and their last computed value
`dataikuapi.dss.metrics.ComputedMetrics`(raw)	Handle to the metrics of a DSS object and their last computed value

Functions#

`compute_metrics`([partition, metric_ids, probes])	Compute metrics on a partition of this dataset.
`get_last_metric_values`([partition])	Get the last values of the metrics on this dataset
`get_settings`()	Get the settings of this dataset as a `DSSDatasetSettings`, or one of its subclasses.
`get_raw`()	Get the raw dataset settings as a dict.
`run_checks`([partition, checks])	Run checks on a partition of this dataset.