Use async reader for parsing Apache Arrow responses (#2788)

Author: Josh Mock
Date: 2025-04-24 14:11:33 -05:00
Committer: GitHub
parent 926b468c6d
commit 710b937bff
3 changed files with 32 additions and 42 deletions

File 1 of 3

@@ -619,7 +619,7 @@ Added in `v8.16.0`
ES|QL can return results in multiple binary formats, including [Apache Arrow](https://arrow.apache.org/)'s streaming format. Because it is a very efficient format to read, it can be valuable for performing high-performance in-memory analytics. And, because the response is streamed as batches of records, it can be used to produce aggregations and other calculations on larger-than-memory data sets.
-`toArrowReader` returns a [`RecordBatchStreamReader`](https://arrow.apache.org/docs/js/classes/Arrow_dom.RecordBatchReader.md).
+`toArrowReader` returns an [`AsyncRecordBatchStreamReader`](https://github.com/apache/arrow/blob/520ae44272d491bbb52eb3c9b84864ed7088f11a/js/src/ipc/reader.ts#L216).
```ts
const reader = await client.helpers
@@ -627,7 +627,7 @@ const reader = await client.helpers
.toArrowReader()
// print each record as JSON
-for (const recordBatch of reader) {
+for await (const recordBatch of reader) {
for (const record of recordBatch) {
console.log(record.toJSON())
}

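For context, a minimal sketch of how the reader described in these docs might be consumed for a larger-than-memory aggregation. The `sample_data` index and `amount` column are borrowed from this commit's tests; the node URL is a placeholder and the running-sum logic is mine, not part of the commit:

```ts
import { Client } from '@elastic/elasticsearch'

const client = new Client({ node: 'http://localhost:9200' }) // placeholder node

const reader = await client.helpers
  .esql({ query: 'FROM sample_data' })
  .toArrowReader()

// Keep a running sum so only one record batch is held in memory at a time.
let total = 0
for await (const recordBatch of reader) {
  const amounts = recordBatch.getChild('amount') // assumes a numeric 'amount' column
  for (const value of amounts ?? []) {
    if (value != null) total += Number(value)
  }
}
console.log(`sum of amount: ${total}`)
```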
File 2 of 3

@@ -11,7 +11,7 @@ import assert from 'node:assert'
import * as timersPromises from 'node:timers/promises'
import { Readable } from 'node:stream'
import { errors, TransportResult, TransportRequestOptions, TransportRequestOptionsWithMeta } from '@elastic/transport'
-import { Table, TypeMap, tableFromIPC, RecordBatchStreamReader } from 'apache-arrow/Arrow.node'
+import { Table, TypeMap, tableFromIPC, AsyncRecordBatchStreamReader } from 'apache-arrow/Arrow.node'
import Client from './client'
import * as T from './api/types'
import { Id } from './api/types'
@@ -135,7 +135,7 @@ export interface EsqlColumn {
export interface EsqlHelper {
toRecords: <TDocument>() => Promise<EsqlToRecords<TDocument>>
toArrowTable: () => Promise<Table<TypeMap>>
-toArrowReader: () => Promise<RecordBatchStreamReader>
+toArrowReader: () => Promise<AsyncRecordBatchStreamReader>
}
export interface EsqlToRecords<TDocument> {
@@ -1000,7 +1000,7 @@ export default class Helpers {
return tableFromIPC(response)
},
-async toArrowReader (): Promise<RecordBatchStreamReader> {
+async toArrowReader (): Promise<AsyncRecordBatchStreamReader> {
if (metaHeader !== null) {
reqOptions.headers = reqOptions.headers ?? {}
reqOptions.headers['x-elastic-client-meta'] = `${metaHeader as string},h=qa`
@@ -1009,9 +1009,9 @@
params.format = 'arrow'
-// @ts-expect-error the return type will be ArrayBuffer when the format is set to 'arrow'
-const response: ArrayBuffer = await client.esql.query(params, reqOptions)
-return RecordBatchStreamReader.from(response)
+// @ts-expect-error response is a Readable when asStream is true
+const response: Readable = await client.esql.query(params, reqOptions)
+return await AsyncRecordBatchStreamReader.from(Readable.from(response))
}
}

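The core of the change, isolated: hand the response body to Arrow's async reader as a Node.js `Readable` instead of buffering it into an `ArrayBuffer` first. Below is a standalone sketch of that pattern, using an in-memory IPC stream as a stand-in for the ES|QL response; the `demo` wrapper is mine, while the imports and the `AsyncRecordBatchStreamReader.from(Readable)` call mirror what this commit uses:

```ts
import { Readable } from 'node:stream'
import { AsyncRecordBatchStreamReader, RecordBatchStreamWriter, tableFromJSON } from 'apache-arrow/Arrow.node'

async function demo (): Promise<void> {
  // Stand-in for the streamed response body: a table serialized to the
  // Arrow IPC stream format, wrapped in a Readable.
  const table = tableFromJSON([{ amount: 1.5 }, { amount: 2.5 }])
  const ipcBytes = await RecordBatchStreamWriter.writeAll(table).toUint8Array()
  const body = Readable.from([Buffer.from(ipcBytes)])

  // `from` resolves once the stream header has been parsed; record
  // batches are then pulled lazily as the loop advances.
  const reader = await AsyncRecordBatchStreamReader.from(body)
  for await (const batch of reader) {
    console.log(batch.numRows)
  }
}

demo().catch(console.error)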
File 3 of 3

@@ -182,17 +182,28 @@ test('ES|QL helper', t => {
t.end()
})
-test('toArrowReader', t => {
-t.test('Parses a binary response into an Arrow stream reader', async t => {
-const binaryContent = '/////zABAAAQAAAAAAAKAA4ABgANAAgACgAAAAAABAAQAAAAAAEKAAwAAAAIAAQACgAAAAgAAAAIAAAAAAAAAAIAAAB8AAAABAAAAJ7///8UAAAARAAAAEQAAAAAAAoBRAAAAAEAAAAEAAAAjP///wgAAAAQAAAABAAAAGRhdGUAAAAADAAAAGVsYXN0aWM6dHlwZQAAAAAAAAAAgv///wAAAQAEAAAAZGF0ZQAAEgAYABQAEwASAAwAAAAIAAQAEgAAABQAAABMAAAAVAAAAAAAAwFUAAAAAQAAAAwAAAAIAAwACAAEAAgAAAAIAAAAEAAAAAYAAABkb3VibGUAAAwAAABlbGFzdGljOnR5cGUAAAAAAAAAAAAABgAIAAYABgAAAAAAAgAGAAAAYW1vdW50AAAAAAAA/////7gAAAAUAAAAAAAAAAwAFgAOABUAEAAEAAwAAABgAAAAAAAAAAAABAAQAAAAAAMKABgADAAIAAQACgAAABQAAABYAAAABQAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAQAAAAAAAAAIAAAAAAAAACgAAAAAAAAAMAAAAAAAAAABAAAAAAAAADgAAAAAAAAAKAAAAAAAAAAAAAAAAgAAAAUAAAAAAAAAAAAAAAAAAAAFAAAAAAAAAAAAAAAAAAAAHwAAAAAAAAAAAACgmZkTQAAAAGBmZiBAAAAAAAAAL0AAAADAzMwjQAAAAMDMzCtAHwAAAAAAAADV6yywkgEAANWPBquSAQAA1TPgpZIBAADV17mgkgEAANV7k5uSAQAA/////wAAAAA='
+test('toArrowReader', async t => {
+const testRecords = [
+{ amount: 4.900000095367432, },
+{ amount: 8.199999809265137, },
+{ amount: 15.5, },
+{ amount: 9.899999618530273, },
+{ amount: 13.899999618530273, },
+]
+// build reusable Arrow table
+const table = arrow.tableFromJSON(testRecords)
+const rawData = await arrow.RecordBatchStreamWriter.writeAll(table).toUint8Array()
+t.test('Parses a binary response into an Arrow stream reader', async t => {
const MockConnection = connection.buildMockConnection({
onRequest (_params) {
return {
-body: Buffer.from(binaryContent, 'base64'),
+body: Buffer.from(rawData),
statusCode: 200,
headers: {
-'content-type': 'application/vnd.elasticsearch+arrow+stream'
+'content-type': 'application/vnd.elasticsearch+arrow+stream',
+'transfer-encoding': 'chunked'
}
}
}
@@ -206,30 +217,8 @@ test('ES|QL helper', t => {
const result = await client.helpers.esql({ query: 'FROM sample_data' }).toArrowReader()
t.ok(result.isStream())
-const testRecords = [
-{
-amount: 4.900000095367432,
-date: 1729532586965,
-},
-{
-amount: 8.199999809265137,
-date: 1729446186965,
-},
-{
-amount: 15.5,
-date: 1729359786965,
-},
-{
-amount: 9.899999618530273,
-date: 1729273386965,
-},
-{
-amount: 13.899999618530273,
-date: 1729186986965,
-},
-]
let count = 0
-for (const recordBatch of result) {
+for await (const recordBatch of result) {
for (const record of recordBatch) {
t.same(record.toJSON(), testRecords[count])
count++
@@ -240,17 +229,16 @@ test('ES|QL helper', t => {
})
t.test('ESQL helper uses correct x-elastic-client-meta helper value', async t => {
-const binaryContent = '/////zABAAAQAAAAAAAKAA4ABgANAAgACgAAAAAABAAQAAAAAAEKAAwAAAAIAAQACgAAAAgAAAAIAAAAAAAAAAIAAAB8AAAABAAAAJ7///8UAAAARAAAAEQAAAAAAAoBRAAAAAEAAAAEAAAAjP///wgAAAAQAAAABAAAAGRhdGUAAAAADAAAAGVsYXN0aWM6dHlwZQAAAAAAAAAAgv///wAAAQAEAAAAZGF0ZQAAEgAYABQAEwASAAwAAAAIAAQAEgAAABQAAABMAAAAVAAAAAAAAwFUAAAAAQAAAAwAAAAIAAwACAAEAAgAAAAIAAAAEAAAAAYAAABkb3VibGUAAAwAAABlbGFzdGljOnR5cGUAAAAAAAAAAAAABgAIAAYABgAAAAAAAgAGAAAAYW1vdW50AAAAAAAA/////7gAAAAUAAAAAAAAAAwAFgAOABUAEAAEAAwAAABgAAAAAAAAAAAABAAQAAAAAAMKABgADAAIAAQACgAAABQAAABYAAAABQAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAQAAAAAAAAAIAAAAAAAAACgAAAAAAAAAMAAAAAAAAAABAAAAAAAAADgAAAAAAAAAKAAAAAAAAAAAAAAAAgAAAAUAAAAAAAAAAAAAAAAAAAAFAAAAAAAAAAAAAAAAAAAAHwAAAAAAAAAAAACgmZkTQAAAAGBmZiBAAAAAAAAAL0AAAADAzMwjQAAAAMDMzCtAHwAAAAAAAADV6yywkgEAANWPBquSAQAA1TPgpZIBAADV17mgkgEAANV7k5uSAQAA/////wAAAAA='
const MockConnection = connection.buildMockConnection({
onRequest (params) {
const header = params.headers?.['x-elastic-client-meta'] ?? ''
t.ok(header.includes('h=qa'), `Client meta header does not include ESQL helper value: ${header}`)
return {
-body: Buffer.from(binaryContent, 'base64'),
+body: Buffer.from(rawData),
statusCode: 200,
headers: {
-'content-type': 'application/vnd.elasticsearch+arrow+stream'
+'content-type': 'application/vnd.elasticsearch+arrow+stream',
+'transfer-encoding': 'chunked'
}
}
}
@@ -289,10 +277,12 @@ test('ES|QL helper', t => {
new arrow.RecordBatch(schema, batch3.data),
])
+const rawData = await arrow.RecordBatchStreamWriter.writeAll(table).toUint8Array()
const MockConnection = connection.buildMockConnection({
onRequest (_params) {
return {
-body: Buffer.from(arrow.tableToIPC(table, "stream")),
+body: Buffer.from(rawData),
statusCode: 200,
headers: {
'content-type': 'application/vnd.elasticsearch+arrow+stream'
@@ -310,7 +300,7 @@ test('ES|QL helper', t => {
t.ok(result.isStream())
let counter = 0
-for (const batch of result) {
+for await (const batch of result) {
for (const row of batch) {
counter++
const { id, val } = row.toJSON()
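
Net effect across the three files: the reader is now consumed with `for await` everywhere, and code that still wants a whole `Table` must collect batches first. A hedged sketch of such a collector; the `collectToTable` helper and the collect-everything strategy are mine, not part of this commit:

```ts
import * as arrow from 'apache-arrow/Arrow.node'

// Drain an async reader into an in-memory Table. Only appropriate when
// the full result set is known to fit in memory; otherwise iterate the
// batches directly, as the updated tests above do.
async function collectToTable (reader: arrow.AsyncRecordBatchStreamReader): Promise<arrow.Table> {
  const batches: arrow.RecordBatch[] = []
  for await (const batch of reader) {
    batches.push(batch)
  }
  return new arrow.Table(...batches)
}
```

In client code this would look like `const table = await collectToTable(await client.helpers.esql({ query }).toArrowReader())`, trading streaming for random access over the full result.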