Source code for queens.data_processors.csv_file

#
# SPDX-License-Identifier: LGPL-3.0-or-later
# Copyright (c) 2024-2025, QUEENS contributors.
#
# This file is part of QUEENS.
#
# QUEENS is free software: you can redistribute it and/or modify it under the terms of the GNU
# Lesser General Public License as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version. QUEENS is distributed in the hope that it will
# be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You
# should have received a copy of the GNU Lesser General Public License along with QUEENS. If not,
# see <https://www.gnu.org/licenses/>.
#
"""Data processor class for csv data extraction."""

import logging

import numpy as np
import pandas as pd

from queens.data_processors._data_processor import DataProcessor
from queens.utils.logger_settings import log_init_args
from queens.utils.valid_options import get_option

# Module-level logger, named after this module via ``__name__``.
_logger = logging.getLogger(__name__)


class CsvFile(DataProcessor):
    """Class for extracting data from csv files.

    Attributes:
        use_cols_lst (lst): List with column numbers that should be read-in.
        filter_type (str): Filter type to use.
        header_row (int): Integer that determines which csv-row contains labels/headers of
                          the columns. Default is 'None', meaning no header used.
        skip_rows (int): Number of rows that should be skipped to be read-in in csv file.
        index_column (int, str): Column to use as the row labels of the DataFrame, either
                                 given as string name or column index.

                                 **Note:** *index_column=False* can be used to force pandas
                                 to not use the first column as the index. *index_column* is
                                 used for filtering the remaining columns.
        use_rows_lst (lst): In case this option is used, the list contains the indices of rows
                            in the csv file that should be used as data.
        filter_range (lst): After data is selected by *use_cols_lst* and a filter column is
                            specified by *index_column*, this option selects which data range
                            shall be filtered by providing a minimum and maximum value pair
                            in list format.
        filter_target_values (list): Target values to filter.
        filter_tol (float): Tolerance for the filter range.
        returned_filter_format (str): Returned data format after filtering.
    """

    # Templates for the filter dictionaries. Only their *keys* are compared against the
    # user-provided filter options; the values merely serve as examples in error messages.
    expected_filter_entire_file = {"type": "entire_file"}
    expected_filter_by_row_index = {"type": "by_row_index", "rows": [1, 2]}
    expected_filter_by_target_values = {
        "type": "by_target_values",
        "target_values": [1.0, 2.0, 3.0],
        "tolerance": 0.0,
    }
    expected_filter_by_range = {"type": "by_range", "range": [1.0, 2.0], "tolerance": 0.0}

    @log_init_args
    def __init__(
        self,
        file_name_identifier=None,
        file_options_dict=None,
        files_to_be_deleted_regex_lst=None,
    ):
        """Instantiate data processor class for csv data.

        Args:
            file_name_identifier (str): Identifier of file name.
                                        The file prefix can contain regex expression
                                        and subdirectories.
            file_options_dict (dict): Dictionary with read-in options for the file:

                - header_row (int): Integer that determines which csv-row contains
                  labels/headers of the columns. Default is 'None', meaning no header used.
                - use_cols_lst (lst): (optional) list with column numbers that should be
                  read-in.
                - skip_rows (int): Number of rows that should be skipped to be read-in in
                  csv file.
                - index_column (int, str): Column to use as the row labels of the DataFrame,
                  either given as string name or column index. Note: index_column=False can
                  be used to force pandas to not use the first column as the index.
                  Index_column is used for filtering the remaining columns.
                - returned_filter_format (str): Returned data format after filtering
                - filter (dict): Dictionary with filter options:

                  -- type (str): Filter type to use
                  -- rows (lst): In case this option is used, the list contains the indices
                  of rows in the csv file that should be used as data
                  -- range (lst): After data is selected by `use_cols_lst` and a filter
                  column is specified by `index_column`, this option selects which data
                  range shall be filtered by providing a minimum and maximum value pair in
                  list format
                  -- target_values (list): target values to filter
                  -- tolerance (float): Tolerance for the filter range

            files_to_be_deleted_regex_lst (lst): List with paths to files that should be
                                                 deleted. The paths can contain regex
                                                 expressions.

        Raises:
            ValueError: If 'header_row', 'skip_rows' or the filter 'type' has the wrong type.
            TypeError: If any other option has the wrong type or the filter options do not
                       match the chosen filter type.

        Returns:
            Instance of CsvFile class
        """
        super().__init__(
            file_name_identifier=file_name_identifier,
            file_options_dict=file_options_dict,
            files_to_be_deleted_regex_lst=files_to_be_deleted_regex_lst,
        )

        # --- read-in options -------------------------------------------------------------
        header_row = file_options_dict.get("header_row")
        if header_row and not isinstance(header_row, int):
            raise ValueError(
                "The option 'header_row' in the data_processor settings must be of type 'int'! "
                f"You provided type '{type(header_row)}'. Abort..."
            )

        use_cols_lst = file_options_dict.get("use_cols_lst")
        if use_cols_lst and not isinstance(use_cols_lst, list):
            raise TypeError(
                "The option 'use_cols_lst' must be of type 'list' "
                f"but you provided type {type(use_cols_lst)}. Abort..."
            )

        skip_rows = file_options_dict.get("skip_rows", 0)
        if not isinstance(skip_rows, int):
            raise ValueError(
                "The option 'skip_rows' in the data_processor settings must be of type 'int'! "
                f"You provided type '{type(skip_rows)}'. Abort..."
            )

        index_column = file_options_dict.get("index_column", False)
        if index_column and not isinstance(index_column, (int, str)):
            raise TypeError(
                "The option 'index_column' must be either of type 'int' or 'str', "
                f"but you provided type {type(index_column)}! Either your original data "
                "type is wrong or the column does not exist in the csv-data file! "
                "Abort..."
            )

        returned_filter_format = file_options_dict.get("returned_filter_format", "numpy")

        # --- filter options --------------------------------------------------------------
        filter_options_dict = file_options_dict.get("filter")
        self.check_valid_filter_options(filter_options_dict)

        filter_type = filter_options_dict.get("type")
        if not isinstance(filter_type, str):
            raise ValueError(
                "The option 'type' in the data_processor settings must be of type 'str'! "
                f"You provided type '{type(filter_type)}'. Abort..."
            )

        use_rows_lst = filter_options_dict.get("rows", [])
        if not isinstance(use_rows_lst, list):
            raise TypeError(
                "The option 'rows' must be of type 'list' "
                f"but you provided type {type(use_rows_lst)}. Abort..."
            )
        if not all(isinstance(row_idx, int) for row_idx in use_rows_lst):
            raise TypeError(
                "The option 'rows' must be a list of `int` "
                f"but you provided type {[type(row_idx) for row_idx in use_rows_lst]}. Abort..."
            )

        filter_range = filter_options_dict.get("range", [])
        if filter_range and not isinstance(filter_range, list):
            raise TypeError(
                "The option 'range' has to be of type 'list', "
                f"but you provided type {type(filter_range)}. Abort..."
            )

        filter_target_values = filter_options_dict.get("target_values", [])
        if not isinstance(filter_target_values, list):
            raise TypeError(
                "The option 'target_values' has to be of type 'list', "
                f"but you provided type {type(filter_target_values)}. Abort..."
            )

        filter_tol = filter_options_dict.get("tolerance", 0.0)
        if not isinstance(filter_tol, float):
            raise TypeError(
                "The option 'tolerance' has to be of type 'float', "
                f"but you provided type {type(filter_tol)}. Abort..."
            )

        self.use_cols_lst = use_cols_lst
        self.filter_type = filter_type
        self.header_row = header_row
        self.skip_rows = skip_rows
        self.index_column = index_column
        self.use_rows_lst = use_rows_lst
        self.filter_range = filter_range
        self.filter_target_values = filter_target_values
        self.filter_tol = filter_tol
        self.returned_filter_format = returned_filter_format

    @classmethod
    def check_valid_filter_options(cls, filter_options_dict):
        """Check valid filter input options.

        The keys of the provided dictionary must match exactly the keys of the
        ``expected_filter_*`` template that belongs to the chosen filter type.

        Args:
            filter_options_dict (dict): dictionary with filter options

        Raises:
            TypeError: If no filter options were provided, the filter type is unknown or
                       the keys do not match the template of the filter type.
            KeyError: If the filter options do not contain the key 'type'.
        """
        if filter_options_dict is None:
            # Previously this surfaced as "'NoneType' object is not subscriptable".
            raise TypeError(
                "The option 'filter' is missing in the data_processor settings! "
                f"You have to provide a dictionary, e.g., {cls.expected_filter_entire_file}."
            )

        expected_options_by_type = {
            "entire_file": cls.expected_filter_entire_file,
            "by_range": cls.expected_filter_by_range,
            "by_row_index": cls.expected_filter_by_row_index,
            "by_target_values": cls.expected_filter_by_target_values,
        }

        filter_type = filter_options_dict["type"]
        # The isinstance guard keeps unhashable values away from the dict lookup.
        if not isinstance(filter_type, str) or filter_type not in expected_options_by_type:
            raise TypeError("You provided an invalid 'filter_type'!")

        expected_options = expected_options_by_type[filter_type]
        if filter_options_dict.keys() != expected_options.keys():
            raise TypeError(
                f"For the filter type `{filter_type}`, you have to provide "
                f"a dictionary of type {expected_options}."
            )

    def get_raw_data_from_file(self, file_path):
        """Get the raw data from the files of interest.

        This method loads the desired parts of the csv file as a pandas
        dataframe. Columns may be separated by commas or whitespace.

        Args:
            file_path (str): Actual path to the file of interest.

        Returns:
            raw_data (DataFrame): Raw data from file, or *None* if reading the file raised
            an IOError (a warning is logged in that case).
        """
        try:
            raw_data = pd.read_csv(
                file_path,
                sep=r",|\s+",
                usecols=self.use_cols_lst,
                skiprows=self.skip_rows,
                header=self.header_row,
                engine="python",  # regex separators are only supported by the python engine
                index_col=self.index_column,
            )
        except IOError as error:
            _logger.warning(
                "Could not read the file: %s. The following IOError was raised: %s. "
                "Skipping the file and continuing.",
                file_path,
                error,
            )
            return None

        _logger.info("Successfully read-in data from %s.", file_path)
        return raw_data

    def filter_and_manipulate_raw_data(self, raw_data):
        """Filter the pandas data-frame based on filter type.

        Args:
            raw_data (DataFrame): Raw data from file.

        Raises:
            RuntimeError: If no data is left after filtering.

        Returns:
            processed_data (np.array, dict): Cleaned, filtered or manipulated
            *data_processor* data in the format given by *returned_filter_format*.
        """
        valid_filter_types = {
            "entire_file": self._filter_entire_file,
            "by_range": self._filter_by_range,
            "by_row_index": self._filter_by_row_index,
            "by_target_values": self._filter_by_target_values,
        }
        filter_method = get_option(
            valid_filter_types,
            self.filter_type,
            error_message="You provided an invalid 'filter_type'!",
        )
        filtered_data = filter_method(raw_data)

        # Converters are looked up first and only the selected one is executed, so the
        # unused output format is never computed.
        format_converters = {
            "numpy": lambda data_frame: data_frame.to_numpy(),
            "dict": lambda data_frame: data_frame.to_dict("list"),
        }
        convert = get_option(
            format_converters,
            self.returned_filter_format,
            error_message="The returned filter format you provided is not a current option.",
        )

        # Emptiness is tested on the DataFrame itself. Testing the converted values with
        # np.any would reject valid data that consists of zeros only.
        if not self._has_data(filtered_data):
            raise RuntimeError(
                "The filtered data was empty! Adjust your filter tolerance or filter range!"
            )

        return convert(filtered_data)

    @staticmethod
    def _has_data(raw_data):
        """Check whether a data frame is available and contains entries.

        Args:
            raw_data (DataFrame): Data frame to check (may be *None*).

        Returns:
            bool: *True* if *raw_data* is not *None* and not empty.
        """
        return raw_data is not None and not raw_data.empty

    def _filter_entire_file(self, raw_data):
        """Keep entire csv file data.

        Args:
            raw_data (DataFrame): Raw data from file.

        Returns:
            raw_data (DataFrame): Raw data from file.
        """
        return raw_data

    def _filter_by_row_index(self, raw_data):
        """Filter the csv file based on given data rows.

        Args:
            raw_data (DataFrame): Raw data from file.

        Raises:
            IndexError: If a requested row position does not exist in the data.

        Returns:
            DataFrame: Filtered data or *None* if there is no raw data.
        """
        if not self._has_data(raw_data):
            return None

        try:
            return raw_data.iloc[self.use_rows_lst]
        except IndexError as exception:
            raise IndexError(
                f"Index list {self.use_rows_lst} are not contained in raw_file_data. "
            ) from exception

    def _filter_by_target_values(self, raw_data):
        """Filter the pandas data frame based on target values.

        For every target value the row whose index value lies within the tolerance is
        selected. If several index values lie within the tolerance, the closest one is
        used.

        Args:
            raw_data (DataFrame): Raw data from file.

        Raises:
            RuntimeError: If no index value lies within the tolerance of a target value.

        Returns:
            DataFrame: Filtered data or *None* if there is no raw data.
        """
        if not self._has_data(raw_data):
            return None

        index_values = np.asarray(raw_data.index)
        target_indices = []
        for target_value in self.filter_target_values:
            deviations = np.abs(index_values - target_value)
            candidates = np.flatnonzero(deviations <= self.filter_tol)
            if candidates.size == 0:
                raise RuntimeError(
                    f"No data found for the target value {target_value} within the "
                    f"tolerance {self.filter_tol}! Adjust your filter tolerance or "
                    "target values!"
                )
            closest_candidate = candidates[np.argmin(deviations[candidates])]
            target_indices.append(int(closest_candidate))

        return raw_data.iloc[target_indices]

    def _filter_by_range(self, raw_data):
        """Filter the pandas data frame based on values in a data column.

        The returned rows span from the first index value within the tolerance of the
        range start to the last index value within the tolerance of the range end
        (both inclusive).

        Args:
            raw_data (DataFrame): Raw data from file.

        Returns:
            DataFrame: Filtered data (empty if a range boundary cannot be found in the
            index) or *None* if there is no raw data.
        """
        if not self._has_data(raw_data):
            return None

        index_values = np.asarray(raw_data.index)
        start_candidates = np.flatnonzero(
            np.abs(index_values - self.filter_range[0]) <= self.filter_tol
        )
        end_candidates = np.flatnonzero(
            np.abs(index_values - self.filter_range[-1]) <= self.filter_tol
        )

        if start_candidates.size == 0 or end_candidates.size == 0:
            # Hand back an empty frame so that the caller can report the empty result.
            return raw_data.iloc[0:0]

        range_start = int(start_candidates[0])
        range_end = int(end_candidates[-1])
        return raw_data.iloc[range_start : range_end + 1]