diff --git a/Cargo.toml b/Cargo.toml
index 8c2992c..b5dfdce 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,11 +5,14 @@ edition = "2024"

 [dependencies]
 clap = { version = "4.5.48", features = ["derive"] }
-polars = {version = "0.53.0", optional = true}
+polars = {version = "0.53.0", optional = true, features=["lazy"]}
 rhai = "1.23.6"
 umya-spreadsheet = "2.3.3"

 [features]
-default = ["csv"]
+default = ["csv", "lazyframe"]
 csv = []
 lazyframe = ["dep:polars"]
+
+[package.metadata.docs.rs]
+all-features = true
\ No newline at end of file
diff --git a/src/arguments.rs b/src/arguments.rs
index c725c34..bfd657f 100644
--- a/src/arguments.rs
+++ b/src/arguments.rs
@@ -88,7 +88,7 @@ impl fmt::Display for NumberRows {
 }

 #[derive(Parser, Debug, Clone)]
-pub struct RawArguments {
+pub(crate) struct RawArguments {
     /// Path to the xlsx file
     #[arg()]
     pub file: String,
@@ -128,45 +128,47 @@ pub struct RawArguments {
     /// replacement for end of line character inside cells
     #[arg(short = 'R', long)]
     pub replace_end_of_line_by: Option,
-    /// a rhai function to filter lines
-    #[arg(long)]
-    pub filter: Option,
 }

+/// The `Arguments` structure holds the options parsed from the shell arguments (using clap).
+///
+/// `Arguments` is also implicitly used by the `XlsxReader` `.with_...()` methods.
#[derive(Debug, Clone)]
 pub struct Arguments {
     /// Path to the xlsx file
-    pub file: String,
-    /// List all worksheets
-    pub list_worksheets: bool,
-    /// Separator
-    pub separator: char,
-    /// Replace separator char in cells by
-    pub replace_separator_by: Option,
-    /// include hidden lines to output
-    // pub include_hidden: IncludeHidden,
-    pub include_hidden_rows: bool,
-    pub include_hidden_columns: bool,
-    /// If merged cells, fill horizontally, vertically, both, or none
-    // pub fill_merged_cells: FillMergedCells,
-    pub fill_merged_cells_horizontal: bool,
-    pub fill_merged_cells_vertical: bool,
-    /// Chosse worksheet
-    pub worksheet: String,
+    pub(crate) file: String,
+    /// List all worksheets in the xlsx file
+    pub(crate) list_worksheets: bool,
+    /// Choose worksheet as name or number
+    pub(crate) worksheet: String,
     /// Use the worksheet that was active when the file was last saved
-    pub active_worksheet: bool,
+    pub(crate) active_worksheet: bool,
-    /// Trim white spaces at end of cells
-    pub trim: TrimSpaces,
+    /// Include hidden lines to output
+    // pub(crate) include_hidden: IncludeHidden,
+    pub(crate) include_hidden_rows: bool,
+    /// Include hidden columns to output
+    pub(crate) include_hidden_columns: bool,
+    /// If merged cells, fill horizontally
+    // pub(crate) fill_merged_cells: FillMergedCells,
+    pub(crate) fill_merged_cells_horizontal: bool,
+    /// If merged cells, fill vertically
+    pub(crate) fill_merged_cells_vertical: bool,
+    /// Trim white spaces at end of cells
+    pub(crate) trim: TrimSpaces,
     /// number the rows in first cell of each line
-    pub number_rows: NumberRows,
-    /// avoid nth first rows of xlsx file
-    pub skip_rows: u32,
-    /// change end of line character
-    pub end_of_line: String,
-    /// replacement for end of line character inside cells
-    pub replace_end_of_line_by: Option,
-    /// a rhai function to filter lines
-    pub filter: Option,
+    pub(crate) number_rows: NumberRows,
+    /// Avoid nth first rows of xlsx file
+    pub(crate) skip_rows: u32,
+
+    // CSV output specific options
+    /// Separator for output
+    pub(crate) separator: char,
+    /// Replace separator char in cells by another
+    pub(crate) replace_separator_by: Option,
+    /// Set the end of line string, defaults to "\n"
+    pub(crate) end_of_line: String,
+    /// replacement for end of line string inside cells
+    pub(crate) replace_end_of_line_by: Option,
 }

 impl Arguments {
@@ -194,7 +196,6 @@ impl Default for Arguments {
             skip_rows: Default::default(),
             end_of_line: String::from("\n"),
             replace_end_of_line_by: Default::default(),
-            filter: Default::default(),
         }
     }
 }
@@ -232,7 +233,6 @@ impl From for Arguments {
             skip_rows: raw.skip_rows,
             end_of_line: raw.end_of_line,
             replace_end_of_line_by: raw.replace_end_of_line_by,
-            filter: raw.filter,
         }
     }
 }
diff --git a/src/error.rs b/src/error.rs
index 9011a9f..5b38f86 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -5,6 +5,7 @@ pub enum Error {
     Msg(String),
     XlsxError(String),
     IoError(String),
+    PolarsError(String),
 }

 impl fmt::Display for Error {
@@ -13,6 +14,7 @@ impl fmt::Display for Error {
             Error::Msg(msg) => write!(f, "XlsxToCsvError: {msg}"),
             Error::XlsxError(msg) => write!(f, "XlsxError: {msg}"),
             Error::IoError(msg) => write!(f, "IoError: {msg}"),
+            Error::PolarsError(msg) => write!(f, "PolarsError: {msg}"),
         }
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 99e41e2..2e10389 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,7 +3,7 @@ pub mod error;
 pub mod xlsx;
 pub mod xlsx_builder;

-//#[cfg(feature = "csv")]
+#[cfg(feature = "csv")]
 pub mod xlsx_to_csv;

 #[cfg(feature = "lazyframe")]
diff --git a/src/main.rs b/src/main.rs
index b9187ed..d978d34 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,9 +1,13 @@
-use xlsxtocsv::{error::Error, xlsx::XlsxReader, xlsx_to_csv::Output};
+use xlsxtocsv::{error::Error, xlsx::XlsxReader};

 fn main() -> Result<(), Error> {
-    XlsxReader::new("noms.xlsx")
+    let lf = XlsxReader::new("noms.xlsx")
         .with_active_worksheet()
-        .to_csv(Output::Stdout)?;
+        .to_lazyframe()?;
+
+    let df =
lf.collect()?;
+
+    println!("{df}");

     Ok(())
 }
diff --git a/src/xlsx.rs b/src/xlsx.rs
index 11eb030..9baf3ea 100644
--- a/src/xlsx.rs
+++ b/src/xlsx.rs
@@ -6,12 +6,13 @@ use umya_spreadsheet::{Cell, Range, Spreadsheet, Worksheet, reader};
 use crate::arguments::{Arguments, IntoArgs, NumberRows, TrimSpaces};
 use crate::error::Error;

+/// A reader for .xlsx files with extended options.
 #[derive(Debug, Clone)]
 pub struct XlsxReader {
     pub(crate) args: Arguments,
     pub(crate) book: Option,
     pub(crate) sheet_index: Option,
-    pub worksheet_dimensions: RefCell>,
+    pub(crate) worksheet_dimensions: RefCell>,
 }

 impl XlsxReader {
diff --git a/src/xlsx_to_lazyframe.rs b/src/xlsx_to_lazyframe.rs
index e69de29..231f7f6 100644
--- a/src/xlsx_to_lazyframe.rs
+++ b/src/xlsx_to_lazyframe.rs
@@ -0,0 +1,67 @@
+use crate::{error::Error, xlsx::XlsxReader};
+use polars::prelude::*;
+
+impl From<PolarsError> for Error {
+    fn from(value: PolarsError) -> Self {
+        Error::PolarsError(value.to_string())
+    }
+}
+
+impl XlsxReader {
+    /// Converts the worksheet data into a polars `LazyFrame`.
+    ///
+    /// Every worksheet column becomes a column named `column_{index}`; rows are
+    /// fetched with `get_row` and appended in batches of `CHUNK_SIZE`.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error from `finish()`, or `Error::PolarsError` when polars
+    /// rejects an append or the final `DataFrame` construction.
+    pub fn to_lazyframe(mut self) -> Result<LazyFrame, Error> {
+        self.finish()?;
+        let (num_cols, num_rows) = self.get_worksheet_dimensions();
+
+        let mut columns: Vec<Column> = (0..num_cols)
+            .map(|i| Column::new(format!("column_{i}").into(), Vec::<String>::new()))
+            .collect();
+
+        const CHUNK_SIZE: usize = 1000;
+        let mut chunk: Vec<Vec<String>> = Vec::with_capacity(CHUNK_SIZE);
+
+        for current_row in 0..num_rows {
+            chunk.push(self.get_row(current_row));
+
+            if chunk.len() >= CHUNK_SIZE {
+                append_chunk(&mut columns, &chunk)?;
+                chunk.clear();
+            }
+        }
+
+        if !chunk.is_empty() {
+            append_chunk(&mut columns, &chunk)?;
+        }
+
+        // `?` converts through `From<PolarsError>`, so the failure is reported
+        // as `Error::PolarsError` instead of being flattened to a string first.
+        let df = DataFrame::new(num_rows as usize, columns)?;
+
+        Ok(df.lazy())
+    }
+}
+
+/// Appends one batch of rows to the per-column accumulators.
+///
+/// A row shorter than `columns` contributes an empty string for its missing
+/// cells rather than panicking on an out-of-range index.
+fn append_chunk(columns: &mut [Column], chunk: &[Vec<String>]) -> Result<(), Error> {
+    for (col_idx, column) in columns.iter_mut().enumerate() {
+        let chunk_data: Vec<String> = chunk
+            .iter()
+            .map(|row| row.get(col_idx).cloned().unwrap_or_default())
+            .collect();
+
+        let chunk_column = Column::new("temp".into(), chunk_data);
+        column.append(&chunk_column)?;
+    }
+    Ok(())
+}