// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. // (Doc section: Basic Example) // (Doc section: Includes) #include #include // (Doc section: Includes) // (Doc section: RunMain Start) arrow::Status RunMain() { // (Doc section: RunMain Start) // (Doc section: int8builder 1 Append) // Builders are the main way to create Arrays in Arrow from existing values that are not // on-disk. In this case, we'll make a simple array, and feed that in. // Data types are important as ever, and there is a Builder for each compatible type; // in this case, int8. arrow::Int8Builder int8builder; int8_t days_raw[5] = {1, 12, 17, 23, 28}; // AppendValues, as called, puts 5 values from days_raw into our Builder object. ARROW_RETURN_NOT_OK(int8builder.AppendValues(days_raw, 5)); // (Doc section: int8builder 1 Append) // (Doc section: int8builder 1 Finish) // We only have a Builder though, not an Array -- the following code pushes out the // built up data into a proper Array. std::shared_ptr days; ARROW_ASSIGN_OR_RAISE(days, int8builder.Finish()); // (Doc section: int8builder 1 Finish) // (Doc section: int8builder 2) // Builders clear their state every time they fill an Array, so if the type is the same, // we can re-use the builder. We do that here for month values. int8_t months_raw[5] = {1, 3, 5, 7, 1}; ARROW_RETURN_NOT_OK(int8builder.AppendValues(months_raw, 5)); std::shared_ptr months; ARROW_ASSIGN_OR_RAISE(months, int8builder.Finish()); // (Doc section: int8builder 2) // (Doc section: int16builder) // Now that we change to int16, we use the Builder for that data type instead. arrow::Int16Builder int16builder; int16_t years_raw[5] = {1990, 2000, 1995, 2000, 1995}; ARROW_RETURN_NOT_OK(int16builder.AppendValues(years_raw, 5)); std::shared_ptr years; ARROW_ASSIGN_OR_RAISE(years, int16builder.Finish()); // (Doc section: int16builder) // (Doc section: Schema) // Now, we want a RecordBatch, which has columns and labels for said columns. // This gets us to the 2d data structures we want in Arrow. // These are defined by schema, which have fields -- here we get both those object types // ready. std::shared_ptr field_day, field_month, field_year; std::shared_ptr schema; // Every field needs its name and data type. field_day = arrow::field("Day", arrow::int8()); field_month = arrow::field("Month", arrow::int8()); field_year = arrow::field("Year", arrow::int16()); // The schema can be built from a vector of fields, and we do so here. schema = arrow::schema({field_day, field_month, field_year}); // (Doc section: Schema) // (Doc section: RBatch) // With the schema and Arrays full of data, we can make our RecordBatch! Here, // each column is internally contiguous. This is in opposition to Tables, which we'll // see next. std::shared_ptr rbatch; // The RecordBatch needs the schema, length for columns, which all must match, // and the actual data itself. rbatch = arrow::RecordBatch::Make(schema, days->length(), {days, months, years}); std::cout << rbatch->ToString(); // (Doc section: RBatch) // (Doc section: More Arrays) // Now, let's get some new arrays! It'll be the same datatypes as above, so we re-use // Builders. int8_t days_raw2[5] = {6, 12, 3, 30, 22}; ARROW_RETURN_NOT_OK(int8builder.AppendValues(days_raw2, 5)); std::shared_ptr days2; ARROW_ASSIGN_OR_RAISE(days2, int8builder.Finish()); int8_t months_raw2[5] = {5, 4, 11, 3, 2}; ARROW_RETURN_NOT_OK(int8builder.AppendValues(months_raw2, 5)); std::shared_ptr months2; ARROW_ASSIGN_OR_RAISE(months2, int8builder.Finish()); int16_t years_raw2[5] = {1980, 2001, 1915, 2020, 1996}; ARROW_RETURN_NOT_OK(int16builder.AppendValues(years_raw2, 5)); std::shared_ptr years2; ARROW_ASSIGN_OR_RAISE(years2, int16builder.Finish()); // (Doc section: More Arrays) // (Doc section: ArrayVector) // ChunkedArrays let us have a list of arrays, which aren't contiguous // with each other. First, we get a vector of arrays. arrow::ArrayVector day_vecs{days, days2}; // (Doc section: ArrayVector) // (Doc section: ChunkedArray Day) // Then, we use that to initialize a ChunkedArray, which can be used with other // functions in Arrow! This is good, since having a normal vector of arrays wouldn't // get us far. std::shared_ptr day_chunks = std::make_shared(day_vecs); // (Doc section: ChunkedArray Day) // (Doc section: ChunkedArray Month Year) // Repeat for months. arrow::ArrayVector month_vecs{months, months2}; std::shared_ptr month_chunks = std::make_shared(month_vecs); // Repeat for years. arrow::ArrayVector year_vecs{years, years2}; std::shared_ptr year_chunks = std::make_shared(year_vecs); // (Doc section: ChunkedArray Month Year) // (Doc section: Table) // A Table is the structure we need for these non-contiguous columns, and keeps them // all in one place for us so we can use them as if they were normal arrays. std::shared_ptr table; table = arrow::Table::Make(schema, {day_chunks, month_chunks, year_chunks}, 10); std::cout << table->ToString(); // (Doc section: Table) // (Doc section: Ret) return arrow::Status::OK(); } // (Doc section: Ret) // (Doc section: Main) int main() { arrow::Status st = RunMain(); if (!st.ok()) { std::cerr << st << std::endl; return 1; } return 0; } // (Doc section: Main) // (Doc section: Basic Example)