ARROW-17377: [C++][Docs] Adds tutorial for basic Arrow, file access, compute, and datasets (#13859)
I intend for this PR to add a few small tutorial articles to the Arrow documentation, for basic Arrow usage, file access, compute, and dataset functionality.
Right now, this is a draft PR, with just the code for the examples. Before I set it up with comments and prose in Sphinx, I wanted to get it reviewed. Do these examples seem suitable for the tutorials they target?
Authored-by: kaesuarez <kaesuarez1423@gmail.com>
Signed-off-by: David Li <li.davidm96@gmail.com>
2022-09-20 18:27:45 -04:00
|
|
|
// Licensed to the Apache Software Foundation (ASF) under one
|
|
|
|
|
// or more contributor license agreements. See the NOTICE file
|
|
|
|
|
// distributed with this work for additional information
|
|
|
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
|
|
|
// to you under the Apache License, Version 2.0 (the
|
|
|
|
|
// "License"); you may not use this file except in compliance
|
|
|
|
|
// with the License. You may obtain a copy of the License at
|
|
|
|
|
//
|
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
//
|
|
|
|
|
// Unless required by applicable law or agreed to in writing,
|
|
|
|
|
// software distributed under the License is distributed on an
|
|
|
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
|
|
|
// KIND, either express or implied. See the License for the
|
|
|
|
|
// specific language governing permissions and limitations
|
|
|
|
|
// under the License.
|
|
|
|
|
|
|
|
|
|
// (Doc section: Dataset Example)
|
|
|
|
|
|
|
|
|
|
// (Doc section: Includes)
|
|
|
|
|
#include <arrow/api.h>
|
2025-06-13 10:03:46 +02:00
|
|
|
#include <arrow/compute/api.h>
|
ARROW-17377: [C++][Docs] Adds tutorial for basic Arrow, file access, compute, and datasets (#13859)
I intend for this PR to add a few small tutorial articles to the Arrow documentation, for basic Arrow usage, file access, compute, and dataset functionality.
Right now, this is a draft PR, with just the code for the examples. Before I set it up with comments and prose in Sphinx, I wanted to get it reviewed. Do these examples seem suitable for the tutorials they target?
Authored-by: kaesuarez <kaesuarez1423@gmail.com>
Signed-off-by: David Li <li.davidm96@gmail.com>
2022-09-20 18:27:45 -04:00
|
|
|
#include <arrow/dataset/api.h>
|
|
|
|
|
// We use Parquet headers for setting up examples; they are not required for using
|
|
|
|
|
// datasets.
|
|
|
|
|
#include <parquet/arrow/reader.h>
|
|
|
|
|
#include <parquet/arrow/writer.h>
|
|
|
|
|
|
2023-08-25 03:29:15 +02:00
|
|
|
#include <unistd.h>
|
ARROW-17377: [C++][Docs] Adds tutorial for basic Arrow, file access, compute, and datasets (#13859)
I intend for this PR to add a few small tutorial articles to the Arrow documentation, for basic Arrow usage, file access, compute, and dataset functionality.
Right now, this is a draft PR, with just the code for the examples. Before I set it up with comments and prose in Sphinx, I wanted to get it reviewed. Do these examples seem suitable for the tutorials they target?
Authored-by: kaesuarez <kaesuarez1423@gmail.com>
Signed-off-by: David Li <li.davidm96@gmail.com>
2022-09-20 18:27:45 -04:00
|
|
|
#include <iostream>
|
|
|
|
|
// (Doc section: Includes)
|
|
|
|
|
|
|
|
|
|
// (Doc section: Helper Functions)
|
|
|
|
|
// Generate some data for the rest of this example.
|
|
|
|
|
arrow::Result<std::shared_ptr<arrow::Table>> CreateTable() {
  // Nothing in here is dataset-specific: it mirrors the basic Arrow example and
  // exists only so that the rest of this example has some data to work with.
  //
  // A single int64 builder is reused for all three columns.
  arrow::NumericBuilder<arrow::Int64Type> int64_builder;

  // Column "a": ascending values.
  std::shared_ptr<arrow::Array> ascending;
  ARROW_RETURN_NOT_OK(int64_builder.AppendValues({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}));
  ARROW_RETURN_NOT_OK(int64_builder.Finish(&ascending));
  int64_builder.Reset();

  // Column "b": descending values.
  std::shared_ptr<arrow::Array> descending;
  ARROW_RETURN_NOT_OK(int64_builder.AppendValues({9, 8, 7, 6, 5, 4, 3, 2, 1, 0}));
  ARROW_RETURN_NOT_OK(int64_builder.Finish(&descending));
  int64_builder.Reset();

  // Column "c": values alternating between 1 and 2.
  std::shared_ptr<arrow::Array> alternating;
  ARROW_RETURN_NOT_OK(int64_builder.AppendValues({1, 2, 1, 2, 1, 2, 1, 2, 1, 2}));
  ARROW_RETURN_NOT_OK(int64_builder.Finish(&alternating));

  // Three int64 fields, in the same order as the arrays handed to Table::Make.
  auto table_schema =
      arrow::schema({arrow::field("a", arrow::int64()), arrow::field("b", arrow::int64()),
                     arrow::field("c", arrow::int64())});
  return arrow::Table::Make(table_schema, {ascending, descending, alternating});
}
|
|
|
|
|
|
|
|
|
|
// Set up a dataset by writing two Parquet files.
|
|
|
|
|
arrow::Result<std::string> CreateExampleParquetDataset(
    const std::shared_ptr<arrow::fs::FileSystem>& filesystem,
    const std::string& root_path) {
  // Much like CreateTable(), this is a utility that gets us the dataset we'll be
  // reading from. Don't worry, we also write a dataset in the example proper.
  //
  // `filesystem` is where the directory and files get created; `root_path` is the
  // directory under which "parquet_dataset" is placed (an empty string keeps the
  // path relative). On success, the returned string is the dataset directory.

  // Join root_path and the dataset directory name, adding a separator only when
  // root_path is non-empty and does not already end with one.
  std::string base_path = root_path;
  if (!base_path.empty() && base_path.back() != '/') {
    base_path += '/';
  }
  base_path += "parquet_dataset";
  ARROW_RETURN_NOT_OK(filesystem->CreateDir(base_path));
  // Create an Arrow Table
  ARROW_ASSIGN_OR_RAISE(auto table, CreateTable());
  // Write it into two Parquet files: the first five rows go to data1.parquet...
  ARROW_ASSIGN_OR_RAISE(auto output,
                        filesystem->OpenOutputStream(base_path + "/data1.parquet"));
  ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
      *table->Slice(0, 5), arrow::default_memory_pool(), output, 2048));
  // ...and the remaining rows go to data2.parquet.
  ARROW_ASSIGN_OR_RAISE(output,
                        filesystem->OpenOutputStream(base_path + "/data2.parquet"));
  ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
      *table->Slice(5), arrow::default_memory_pool(), output, 2048));
  return base_path;
}
|
|
|
|
|
|
|
|
|
|
arrow::Status PrepareEnv() {
  // Initialize the compute module to register the required kernels for Dataset
  ARROW_RETURN_NOT_OK(arrow::compute::Initialize());
  // Get our environment prepared for reading, by setting up some quick writing.
  // Note this operates in the current working directory of the process.
  // The buffer is sized generously so that ordinary deep paths do not make
  // getcwd() fail; a failure is still reported as an IOError.
  char setup_path[4096];
  if (getcwd(setup_path, sizeof(setup_path)) == nullptr) {
    return arrow::Status::IOError("Fetching PWD failed.");
  }

  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::fs::FileSystem> setup_fs,
                        arrow::fs::FileSystemFromUriOrPath(setup_path));
  // Only success or failure matters here; the returned dataset path is not needed.
  ARROW_RETURN_NOT_OK(CreateExampleParquetDataset(setup_fs, "").status());

  return arrow::Status::OK();
}
|
|
|
|
|
// (Doc section: Helper Functions)
|
|
|
|
|
|
|
|
|
|
// (Doc section: RunMain)
|
|
|
|
|
arrow::Status RunMain() {
  // (Doc section: RunMain)
  // (Doc section: PrepareEnv)
  ARROW_RETURN_NOT_OK(PrepareEnv());
  // (Doc section: PrepareEnv)

  // (Doc section: FileSystem Declare)
  // First, we need a filesystem object, which lets us interact with our local
  // filesystem starting at a given path. For the sake of simplicity, that'll be
  // the current directory.
  std::shared_ptr<arrow::fs::FileSystem> fs;
  // (Doc section: FileSystem Declare)

  // (Doc section: FileSystem Init)
  // Get the CWD, use it to make the FileSystem object.
  char init_path[4096];
  char* result = getcwd(init_path, sizeof(init_path));
  if (result == nullptr) {
    return arrow::Status::IOError("Fetching PWD failed.");
  }
  ARROW_ASSIGN_OR_RAISE(fs, arrow::fs::FileSystemFromUriOrPath(init_path));
  // (Doc section: FileSystem Init)

  // (Doc section: FileSelector Declare)
  // A file selector lets us actually traverse a multi-file dataset.
  arrow::fs::FileSelector selector;
  // (Doc section: FileSelector Declare)
  // (Doc section: FileSelector Config)
  selector.base_dir = "parquet_dataset";
  // Recursive is a safe bet if you don't know the nesting of your dataset.
  selector.recursive = true;
  // (Doc section: FileSelector Config)
  // (Doc section: FileSystemFactoryOptions)
  // Making an options object lets us configure our dataset reading.
  arrow::dataset::FileSystemFactoryOptions options;
  // We'll use Hive-style partitioning. We'll let Arrow Datasets infer the partition
  // schema. We won't set any other options, defaults are fine.
  options.partitioning = arrow::dataset::HivePartitioning::MakeFactory();
  // (Doc section: FileSystemFactoryOptions)
  // (Doc section: File Format Setup)
  auto read_format = std::make_shared<arrow::dataset::ParquetFileFormat>();
  // (Doc section: File Format Setup)
  // (Doc section: FileSystemDatasetFactory Make)
  // Now, we get a factory that will let us get our dataset -- we don't have the
  // dataset yet!
  ARROW_ASSIGN_OR_RAISE(auto factory, arrow::dataset::FileSystemDatasetFactory::Make(
                                          fs, selector, read_format, options));
  // (Doc section: FileSystemDatasetFactory Make)
  // (Doc section: FileSystemDatasetFactory Finish)
  // Now we build our dataset from the factory.
  ARROW_ASSIGN_OR_RAISE(auto read_dataset, factory->Finish());
  // (Doc section: FileSystemDatasetFactory Finish)
  // (Doc section: Dataset Fragments)
  // Print out the fragments
  ARROW_ASSIGN_OR_RAISE(auto fragments, read_dataset->GetFragments());
  for (const auto& maybe_fragment : fragments) {
    // Check each element before dereferencing it, so that an error during
    // iteration is returned to the caller rather than dereferenced.
    if (!maybe_fragment.ok()) {
      return maybe_fragment.status();
    }
    const auto& fragment = *maybe_fragment;
    std::cout << "Found fragment: " << fragment->ToString() << std::endl;
    std::cout << "Partition expression: "
              << fragment->partition_expression().ToString() << std::endl;
  }
  // (Doc section: Dataset Fragments)
  // (Doc section: Read Scan Builder)
  // Scan dataset into a Table -- once this is done, you can do
  // normal table things with it, like computation and printing. However, now you're
  // also dedicated to being in memory.
  ARROW_ASSIGN_OR_RAISE(auto read_scan_builder, read_dataset->NewScan());
  // (Doc section: Read Scan Builder)
  // (Doc section: Read Scanner)
  ARROW_ASSIGN_OR_RAISE(auto read_scanner, read_scan_builder->Finish());
  // (Doc section: Read Scanner)
  // (Doc section: To Table)
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table> table, read_scanner->ToTable());
  std::cout << table->ToString();
  // (Doc section: To Table)

  // (Doc section: TableBatchReader)
  // Now, let's get a table out to disk as a dataset!
  // We make a RecordBatchReader from our Table, then set up a scanner, which lets us
  // go to a file.
  std::shared_ptr<arrow::TableBatchReader> write_dataset =
      std::make_shared<arrow::TableBatchReader>(table);
  // (Doc section: TableBatchReader)
  // (Doc section: WriteScanner)
  auto write_scanner_builder =
      arrow::dataset::ScannerBuilder::FromRecordBatchReader(write_dataset);
  ARROW_ASSIGN_OR_RAISE(auto write_scanner, write_scanner_builder->Finish());
  // (Doc section: WriteScanner)
  // (Doc section: Partition Schema)
  // The partition schema determines which fields are used as keys for partitioning.
  auto partition_schema = arrow::schema({arrow::field("a", arrow::utf8())});
  // (Doc section: Partition Schema)
  // (Doc section: Partition Create)
  // We'll use Hive-style partitioning, which creates directories with "key=value"
  // pairs.
  auto partitioning =
      std::make_shared<arrow::dataset::HivePartitioning>(partition_schema);
  // (Doc section: Partition Create)
  // (Doc section: Write Format)
  // Now, we declare we'll be writing Parquet files.
  auto write_format = std::make_shared<arrow::dataset::ParquetFileFormat>();
  // (Doc section: Write Format)
  // (Doc section: Write Options)
  // This time, we make Options for writing, but do much more configuration.
  arrow::dataset::FileSystemDatasetWriteOptions write_options;
  // Defaults to start.
  write_options.file_write_options = write_format->DefaultWriteOptions();
  // (Doc section: Write Options)
  // (Doc section: Options FS)
  // Use the filesystem we already have.
  write_options.filesystem = fs;
  // (Doc section: Options FS)
  // (Doc section: Options Target)
  // Write to the folder "write_dataset" in current directory.
  write_options.base_dir = "write_dataset";
  // (Doc section: Options Target)
  // (Doc section: Options Partitioning)
  // Use the partitioning declared above.
  write_options.partitioning = partitioning;
  // (Doc section: Options Partitioning)
  // (Doc section: Options Name Template)
  // Define what the name for the files making up the dataset will be.
  write_options.basename_template = "part{i}.parquet";
  // (Doc section: Options Name Template)
  // (Doc section: Options File Behavior)
  // Set behavior to overwrite existing data -- specifically, this lets this example
  // be run more than once, and allows whatever code you have to overwrite what's there.
  write_options.existing_data_behavior =
      arrow::dataset::ExistingDataBehavior::kOverwriteOrIgnore;
  // (Doc section: Options File Behavior)
  // (Doc section: Write Dataset)
  // Write to disk!
  ARROW_RETURN_NOT_OK(
      arrow::dataset::FileSystemDataset::Write(write_options, write_scanner));
  // (Doc section: Write Dataset)
  // (Doc section: Ret)
  return arrow::Status::OK();
}
|
|
|
|
|
// (Doc section: Ret)
|
|
|
|
|
// (Doc section: Main)
|
|
|
|
|
int main() {
|
|
|
|
|
arrow::Status st = RunMain();
|
|
|
|
|
if (!st.ok()) {
|
|
|
|
|
std::cerr << st << std::endl;
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
// (Doc section: Main)
|
|
|
|
|
|
|
|
|
|
// (Doc section: Dataset Example)
|