Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Cloud support for new-streaming scans and sinks #21621

Merged
merged 1 commit into from
Mar 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions crates/polars-plan/src/dsl/scan_sources.rs
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,28 @@ impl ScanSourceRef<'_> {
}
}

/// Materialize this scan source as a [`MemSlice`].
///
/// For path sources with `run_async` set, the file is resolved through the
/// global file cache and re-validated against the remote object before use
/// (requires the "cloud" feature); otherwise the path is opened directly from
/// the local filesystem. In-memory buffers are cloned cheaply and already-open
/// files are mapped as-is.
pub fn to_memslice_async_check_latest(&self, run_async: bool) -> PolarsResult<MemSlice> {
    match self {
        ScanSourceRef::Buffer(buff) => Ok((*buff).clone()),
        ScanSourceRef::File(file) => MemSlice::from_file(file),
        ScanSourceRef::Path(path) => {
            let file = if !run_async {
                polars_utils::open_file(path)?
            } else {
                feature_gated!("cloud", {
                    // NOTE(review): `to_str().unwrap()` panics on non-UTF-8
                    // paths — appears to be the established convention in this
                    // codebase; confirm paths are guaranteed UTF-8 here.
                    polars_io::file_cache::FILE_CACHE
                        .get_entry(path.to_str().unwrap())
                        // Safety: This was initialized by schema inference.
                        .unwrap()
                        .try_open_check_latest()?
                })
            };

            MemSlice::from_file(&file)
        },
    }
}

pub fn to_memslice_possibly_async(
&self,
run_async: bool,
Expand Down
5 changes: 5 additions & 0 deletions crates/polars-python/src/lazyframe/sink.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ impl PyPartitioning {
variant: PartitionVariant::MaxSize(max_size),
}
}

#[getter]
// Expose the sink's base path to Python as a `str`.
// NOTE(review): `to_str().unwrap()` panics if the path is not valid UTF-8 —
// consistent with other call sites in this change set, but confirm the path
// is always constructed from a Python `str` so this cannot trigger.
fn path(&self) -> &str {
    self.path.to_str().unwrap()
}
}

impl<'py> FromPyObject<'py> for SinkTarget {
Expand Down
33 changes: 20 additions & 13 deletions crates/polars-stream/src/nodes/io_sinks/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ use polars_core::frame::DataFrame;
use polars_core::schema::SchemaRef;
use polars_error::PolarsResult;
use polars_expr::state::ExecutionState;
use polars_io::cloud::CloudOptions;
use polars_io::prelude::{CsvWriter, CsvWriterOptions};
use polars_io::utils::file::AsyncWriteable;
use polars_io::SerWriter;
use polars_plan::dsl::SinkOptions;
use polars_utils::priority::Priority;
Expand All @@ -22,19 +24,22 @@ pub struct CsvSinkNode {
schema: SchemaRef,
sink_options: SinkOptions,
write_options: CsvWriterOptions,
cloud_options: Option<CloudOptions>,
}
impl CsvSinkNode {
pub fn new(
schema: SchemaRef,
path: PathBuf,
schema: SchemaRef,
sink_options: SinkOptions,
write_options: CsvWriterOptions,
cloud_options: Option<CloudOptions>,
) -> Self {
Self {
path,
schema,
sink_options,
write_options,
cloud_options,
}
}
}
Expand Down Expand Up @@ -144,35 +149,37 @@ impl SinkNode for CsvSinkNode {
let schema = self.schema.clone();
let include_header = self.write_options.include_header;
let include_bom = self.write_options.include_bom;
let cloud_options = self.cloud_options.clone();
let io_task = polars_io::pl_async::get_runtime().spawn(async move {
use tokio::fs::OpenOptions;
use tokio::io::AsyncWriteExt;

let mut file = OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(path.as_path())
.await
.map_err(|err| polars_utils::_limit_path_len_io_err(path.as_path(), err))?;
let mut file = polars_io::utils::file::Writeable::try_new(
path.to_str().unwrap(),
cloud_options.as_ref(),
)?;

// Write the header
if include_header || include_bom {
let mut std_file = file.into_std().await;
let mut writer = CsvWriter::new(&mut std_file)
let mut writer = CsvWriter::new(&mut *file)
.include_bom(include_bom)
.include_header(include_header)
.n_threads(1) // Disable rayon parallelism
.batched(&schema)?;
writer.write_batch(&DataFrame::empty_with_schema(&schema))?;
file = tokio::fs::File::from_std(std_file);
}

let mut file = file.try_into_async_writeable()?;

while let Some(Priority(_, buffer)) = lin_rx.get().await {
file.write_all(&buffer).await?;
}

tokio_sync_on_close(sink_options.sync_on_close, &mut file).await?;
if let AsyncWriteable::Local(file) = &mut file {
tokio_sync_on_close(sink_options.sync_on_close, file).await?;
}

file.close().await?;

PolarsResult::Ok(())
});
join_handles.push(spawn(TaskPriority::Low, async move {
Expand Down
28 changes: 17 additions & 11 deletions crates/polars-stream/src/nodes/io_sinks/ipc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ use polars_core::utils::arrow::io::ipc::write::{
};
use polars_error::PolarsResult;
use polars_expr::state::ExecutionState;
use polars_io::cloud::CloudOptions;
use polars_io::ipc::{IpcWriter, IpcWriterOptions};
use polars_io::utils::file::Writeable;
use polars_io::SerWriter;
use polars_plan::dsl::SinkOptions;
use polars_utils::priority::Priority;
Expand All @@ -39,6 +41,7 @@ pub struct IpcSinkNode {
compat_level: CompatLevel,

chunk_size: usize,
cloud_options: Option<CloudOptions>,
}

impl IpcSinkNode {
Expand All @@ -47,6 +50,7 @@ impl IpcSinkNode {
path: PathBuf,
sink_options: SinkOptions,
write_options: IpcWriterOptions,
cloud_options: Option<CloudOptions>,
) -> Self {
Self {
path,
Expand All @@ -58,6 +62,7 @@ impl IpcSinkNode {
compat_level: CompatLevel::newest(), // @TODO: make this accessible from outside

chunk_size: get_ideal_morsel_size(), // @TODO: change to something more appropriate
cloud_options,
}
}
}
Expand Down Expand Up @@ -322,18 +327,14 @@ impl SinkNode for IpcSinkNode {
let path = self.path.clone();
let sink_options = self.sink_options.clone();
let write_options = self.write_options;
let cloud_options = self.cloud_options.clone();
let input_schema = self.input_schema.clone();
let io_task = polars_io::pl_async::get_runtime().spawn(async move {
use tokio::fs::OpenOptions;

let file = OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(path.as_path())
.await?;
let mut file = file.into_std().await;
let writer = BufWriter::new(&mut file);
let mut file = polars_io::utils::file::Writeable::try_new(
path.to_str().unwrap(),
cloud_options.as_ref(),
)?;
let writer = BufWriter::new(&mut *file);
let mut writer = IpcWriter::new(writer)
.with_compression(write_options.compression)
.with_parallel(false)
Expand All @@ -348,7 +349,12 @@ impl SinkNode for IpcSinkNode {
writer.finish()?;
drop(writer);

sync_on_close(sink_options.sync_on_close, &mut file)?;
if let Writeable::Local(file) = &mut file {
sync_on_close(sink_options.sync_on_close, file)?;
}

file.close()?;

PolarsResult::Ok(())
});
join_handles.push(spawn(TaskPriority::Low, async move {
Expand Down
36 changes: 25 additions & 11 deletions crates/polars-stream/src/nodes/io_sinks/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ use std::path::PathBuf;

use polars_error::PolarsResult;
use polars_expr::state::ExecutionState;
use polars_io::cloud::CloudOptions;
use polars_io::json::BatchedWriter;
use polars_io::utils::file::AsyncWriteable;
use polars_plan::dsl::SinkOptions;
use polars_utils::priority::Priority;

Expand All @@ -17,10 +19,19 @@ type Linearized = Priority<Reverse<MorselSeq>, Vec<u8>>;
pub struct NDJsonSinkNode {
path: PathBuf,
sink_options: SinkOptions,
cloud_options: Option<CloudOptions>,
}
impl NDJsonSinkNode {
pub fn new(path: PathBuf, sink_options: SinkOptions) -> Self {
Self { path, sink_options }
pub fn new(
path: PathBuf,
sink_options: SinkOptions,
cloud_options: Option<CloudOptions>,
) -> Self {
Self {
path,
sink_options,
cloud_options,
}
}
}

Expand Down Expand Up @@ -102,29 +113,32 @@ impl SinkNode for NDJsonSinkNode {
PolarsResult::Ok(())
})
}));
let cloud_options = self.cloud_options.clone();

// IO task.
//
// Task that will actually do write to the target file.
let sink_options = self.sink_options.clone();
let path = self.path.clone();
let io_task = polars_io::pl_async::get_runtime().spawn(async move {
use tokio::fs::OpenOptions;
use tokio::io::AsyncWriteExt;

let mut file = OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(path.as_path())
.await
.map_err(|err| polars_utils::_limit_path_len_io_err(path.as_path(), err))?;
let mut file = polars_io::utils::file::AsyncWriteable::try_new(
path.to_str().unwrap(),
cloud_options.as_ref(),
)
.await?;

while let Some(Priority(_, buffer)) = lin_rx.get().await {
file.write_all(&buffer).await?;
}

tokio_sync_on_close(sink_options.sync_on_close, &mut file).await?;
if let AsyncWriteable::Local(file) = &mut file {
tokio_sync_on_close(sink_options.sync_on_close, file).await?;
}

file.close().await?;

PolarsResult::Ok(())
});
join_handles.push(spawn(TaskPriority::Low, async move {
Expand Down
28 changes: 17 additions & 11 deletions crates/polars-stream/src/nodes/io_sinks/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@ use polars_core::prelude::{ArrowSchema, CompatLevel};
use polars_core::schema::SchemaRef;
use polars_error::PolarsResult;
use polars_expr::state::ExecutionState;
use polars_io::cloud::CloudOptions;
use polars_io::parquet::write::BatchedWriter;
use polars_io::prelude::{get_encodings, ParquetWriteOptions};
use polars_io::schema_to_arrow_checked;
use polars_io::utils::file::Writeable;
use polars_parquet::parquet::error::ParquetResult;
use polars_parquet::read::ParquetError;
use polars_parquet::write::{
Expand Down Expand Up @@ -40,6 +42,7 @@ pub struct ParquetSinkNode {
parquet_schema: SchemaDescriptor,
arrow_schema: ArrowSchema,
encodings: Vec<Vec<Encoding>>,
cloud_options: Option<CloudOptions>,
}

impl ParquetSinkNode {
Expand All @@ -48,6 +51,7 @@ impl ParquetSinkNode {
path: &Path,
sink_options: SinkOptions,
write_options: &ParquetWriteOptions,
cloud_options: Option<CloudOptions>,
) -> PolarsResult<Self> {
let schema = schema_to_arrow_checked(&input_schema, CompatLevel::newest(), "parquet")?;
let parquet_schema = to_parquet_schema(&schema)?;
Expand All @@ -63,6 +67,7 @@ impl ParquetSinkNode {
parquet_schema,
arrow_schema: schema,
encodings,
cloud_options,
})
}
}
Expand Down Expand Up @@ -251,22 +256,18 @@ impl SinkNode for ParquetSinkNode {
// spawned once.
let path = self.path.clone();
let sink_options = self.sink_options.clone();
let cloud_options = self.cloud_options.clone();
let write_options = self.write_options;
let arrow_schema = self.arrow_schema.clone();
let parquet_schema = self.parquet_schema.clone();
let encodings = self.encodings.clone();
let io_task = polars_io::pl_async::get_runtime().spawn(async move {
use tokio::fs::OpenOptions;
let mut file = polars_io::utils::file::Writeable::try_new(
path.to_str().unwrap(),
cloud_options.as_ref(),
)?;

let file = OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(path.as_path())
.await
.map_err(|err| polars_utils::_limit_path_len_io_err(path.as_path(), err))?;
let mut file = file.into_std().await;
let writer = BufWriter::new(&mut file);
let writer = BufWriter::new(&mut *file);
let write_options = WriteOptions {
statistics: write_options.statistics,
compression: write_options.compression.into(),
Expand All @@ -292,7 +293,12 @@ impl SinkNode for ParquetSinkNode {
writer.finish()?;
drop(writer);

sync_on_close(sink_options.sync_on_close, &mut file)?;
if let Writeable::Local(file) = &mut file {
sync_on_close(sink_options.sync_on_close, file)?;
}

file.close()?;

PolarsResult::Ok(())
});
join_handles.push(spawn(TaskPriority::Low, async move {
Expand Down
Loading
Loading