[Backport 8.16] Add streaming support to Arrow helper (#2430)

Josh Mock
2024-11-04 16:20:06 -06:00
committed by GitHub
parent 38358e20ab
commit bb5fb24d73
8 changed files with 402 additions and 95 deletions

View File

@@ -13,6 +13,11 @@
You can find all the API changes
https://www.elastic.co/guide/en/elasticsearch/reference/8.16/release-notes-8.16.0.html[here].
[discrete]
===== Support Apache Arrow in ES|QL helper
The ES|QL helper can now return results as an Apache Arrow `Table` or `RecordBatchReader`, which enables high-performance calculations on ES|QL results, even if the response data is larger than the system's available memory. See <<esql-helper>> for more information.
[discrete]
==== Fixes

View File

@@ -349,7 +349,7 @@ In this case, the result will be:
body: object | boolean
statusCode: number
headers: object
warnings: [string],
warnings: string[],
meta: object
}
----
@@ -410,19 +410,23 @@ The supported request specific options are:
[cols=2*]
|===
|`ignore`
|`[number]` - HTTP status codes which should not be considered errors for this request. +
|`number[]` - HTTP status codes which should not be considered errors for this request. +
_Default:_ `null`
|`requestTimeout`
|`number` - Max request timeout for the request in milliseconds, it overrides the client default. +
|`number | string` - Max request timeout for the request in milliseconds; it overrides the client default. +
_Default:_ `30000`
|`retryOnTimeout`
|`boolean` - Retry requests that have timed out. +
_Default:_ `false`
|`maxRetries`
|`number` - Max number of retries for the request; it overrides the client default. +
_Default:_ `3`
|`compression`
|`string, boolean` - Enables body compression for the request. +
|`string | boolean` - Enables body compression for the request. +
_Options:_ `false`, `'gzip'` +
_Default:_ `false`
@@ -446,6 +450,10 @@ _Default:_ `null`
|`any` - Custom object per request. _(you can use it to pass data to the client's events)_ +
_Default:_ `null`
|`opaqueId`
|`string` - Set the `X-Opaque-Id` HTTP header. See {ref}/api-conventions.html#x-opaque-id +
_Default:_ `null`
|`maxResponseSize`
|`number` - When configured, it verifies that the uncompressed response size is lower than the configured number; if it is higher, the request will be aborted. It cannot be higher than `buffer.constants.MAX_STRING_LENGTH` +
_Default:_ `null`
@@ -458,6 +466,17 @@ _Default:_ `null`
|`AbortSignal` - The AbortSignal instance to allow request abortion. +
_Default:_ `null`
|`meta`
|`boolean` - Rather than returning the body, return an object containing `body`, `statusCode`, `headers` and `meta` keys +
_Default_: `false`
|`redaction`
|`object` - Options for redacting potentially sensitive data from error metadata. See <<redaction>>.
|`retryBackoff`
|`(min: number, max: number, attempt: number) => number` - A function that calculates how long to sleep, in seconds, before the next request retry; see the example after this table. +
_Default:_ A built-in function that uses exponential backoff with jitter.
|===
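As an illustration, request-specific options are passed as the second argument to any API method. The following is a minimal sketch (the index name and option values are only placeholders) that combines several of the options above, including a custom `retryBackoff` function:
[source,ts]
----
const response = await client.search(
  { index: 'my-index', query: { match_all: {} } },
  {
    // treat 404 responses as non-errors for this request
    ignore: [404],
    requestTimeout: 5000,
    maxRetries: 2,
    // exponential backoff with jitter: pick a random delay between `min`
    // and 2^attempt seconds, capped at `max`
    retryBackoff: (min, max, attempt) => {
      const ceiling = Math.min(max, 2 ** attempt)
      return min + Math.random() * (ceiling - min)
    },
    // return { body, statusCode, headers, meta } rather than just the body
    meta: true
  }
)
----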
[discrete]

View File

@@ -707,3 +707,42 @@ const result = await client.helpers
.esql({ query: 'FROM sample_data | LIMIT 2' })
.toRecords<EventLog>()
----
[discrete]
===== `toArrowReader`
~Added~ ~in~ ~`v8.16.0`~
ES|QL can return results in multiple binary formats, including https://arrow.apache.org/[Apache Arrow]'s streaming format. Because it is a very efficient format to read, it can be valuable for performing high-performance in-memory analytics. And, because the response is streamed as batches of records, it can be used to produce aggregations and other calculations on larger-than-memory data sets.
`toArrowReader` returns a https://arrow.apache.org/docs/js/classes/Arrow_dom.RecordBatchReader.html[`RecordBatchStreamReader`].
[source,ts]
----
const reader = await client.helpers
.esql({ query: 'FROM sample_data' })
.toArrowReader()
// print each record as JSON
for (const recordBatch of reader) {
for (const record of recordBatch) {
console.log(record.toJSON())
}
}
----
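Because the data arrives as a stream of record batches, you can also run aggregations over result sets that are too large to hold in memory. A minimal sketch, assuming the queried data has a numeric `amount` column:
[source,ts]
----
const reader = await client.helpers
  .esql({ query: 'FROM sample_data' })
  .toArrowReader()

// sum one column batch by batch, without materializing all rows at once
let total = 0
for (const recordBatch of reader) {
  const amounts = recordBatch.getChild('amount')
  if (amounts === null) continue
  for (const value of amounts) {
    total += Number(value)
  }
}
console.log(total)
----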
[discrete]
===== `toArrowTable`
~Added~ ~in~ ~`v8.16.0`~
If you would like to pull the entire data set in Arrow format but without streaming, you can use the `toArrowTable` helper to get a https://arrow.apache.org/docs/js/classes/Arrow_dom.Table.html[Table] back instead.
[source,ts]
----
const table = await client.helpers
.esql({ query: 'FROM sample_data' })
.toArrowTable()
console.log(table.toArray())
----
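The returned Table also supports columnar access when you only need one field. A minimal sketch, assuming an `amount` column exists in the queried data:
[source,ts]
----
const table = await client.helpers
  .esql({ query: 'FROM sample_data' })
  .toArrowTable()

// getChild returns the named column as a Vector, or null if it does not exist
const amounts = table.getChild('amount')
console.log(amounts?.toArray())
----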

View File

@@ -35,8 +35,39 @@ class MyTransport extends Transport {
==== Supported content types
- `application/json`, in this case the transport will return a plain JavaScript object
- `text/plain`, in this case the transport will return a plain string
- `application/vnd.mapbox-vector-tile`, in this case the transport will return a Buffer
- `application/vnd.elasticsearch+json`, in this case the transport will return a plain JavaScript object
Depending on the `content-type` of the response, the transport will return the body as one of the following types:
[cols="1,1"]
|===
|Content-Type |JavaScript type
|`application/json`
|`object`
|`text/plain`
|`string`
|`application/vnd.elasticsearch+json`
|`object`
|`application/vnd.mapbox-vector-tile`
|`Buffer`
|`application/vnd.apache.arrow.stream`
|`Buffer`
|`application/vnd.elasticsearch+arrow+stream`
|`Buffer`
|`application/smile`
|`Buffer`
|`application/vnd.elasticsearch+smile`
|`Buffer`
|`application/cbor`
|`Buffer`
|`application/vnd.elasticsearch+cbor`
|`Buffer`
|===
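For example, a response served with one of the binary content types above is handed back as a raw `Buffer`. A minimal sketch using an ES|QL query in the Arrow streaming format (the other `Buffer` content types behave the same way):
[source,ts]
----
const body = await client.esql.query({
  query: 'FROM sample_data',
  format: 'arrow'
})

// the response content type is application/vnd.elasticsearch+arrow+stream,
// so the transport returns the raw bytes rather than a parsed object
console.log(Buffer.isBuffer(body)) // true
----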

View File

@@ -87,7 +87,8 @@
"zx": "^7.2.2"
},
"dependencies": {
"@elastic/transport": "^8.7.0",
"@elastic/transport": "^8.9.1",
"apache-arrow": "^18.0.0",
"tslib": "^2.4.0"
},
"tap": {

View File

@@ -25,6 +25,7 @@ import assert from 'node:assert'
import * as timersPromises from 'node:timers/promises'
import { Readable } from 'node:stream'
import { errors, TransportResult, TransportRequestOptions, TransportRequestOptionsWithMeta } from '@elastic/transport'
import { Table, TypeMap, tableFromIPC, RecordBatchStreamReader } from 'apache-arrow/Arrow.node'
import Client from './client'
import * as T from './api/types'
@@ -155,6 +156,8 @@ export interface EsqlResponse {
export interface EsqlHelper {
toRecords: <TDocument>() => Promise<EsqlToRecords<TDocument>>
toArrowTable: () => Promise<Table<TypeMap>>
toArrowReader: () => Promise<RecordBatchStreamReader>
}
export interface EsqlToRecords<TDocument> {
@@ -985,6 +988,8 @@ export default class Helpers {
})
}
const metaHeader = this[kMetaHeader]
const helper: EsqlHelper = {
/**
* Pivots ES|QL query results into an array of row objects, rather than the default format where each row is an array of values.
@@ -996,6 +1001,31 @@
const records: TDocument[] = toRecords(response)
const { columns } = response
return { records, columns }
},
/**
* Returns the query results as an Apache Arrow Table, parsed from Arrow's IPC stream format.
*/
async toArrowTable (): Promise<Table<TypeMap>> {
if (metaHeader !== null) {
reqOptions.headers = reqOptions.headers ?? {}
reqOptions.headers['x-elastic-client-meta'] = `${metaHeader as string},h=qa`
}
params.format = 'arrow'
const response = await client.esql.query(params, reqOptions)
return tableFromIPC(response)
},
/**
* Returns a reader that yields query results as a stream of Arrow record batches.
*/
async toArrowReader (): Promise<RecordBatchStreamReader> {
if (metaHeader !== null) {
reqOptions.headers = reqOptions.headers ?? {}
reqOptions.headers['x-elastic-client-meta'] = `${metaHeader as string},h=qa`
}
// stream the response body so record batches can be read incrementally
reqOptions.asStream = true
params.format = 'arrow'
const response = await client.esql.query(params, reqOptions)
return RecordBatchStreamReader.from(response)
}
}

View File

@@ -18,6 +18,7 @@
*/
import { test } from 'tap'
import * as arrow from 'apache-arrow/Arrow.node'
import { connection } from '../../utils'
import { Client } from '../../../'
@@ -109,5 +110,185 @@ test('ES|QL helper', t => {
t.end()
})
test('toArrowTable', t => {
t.test('Parses a binary response into an Arrow table', async t => {
const binaryContent = '/////zABAAAQAAAAAAAKAA4ABgANAAgACgAAAAAABAAQAAAAAAEKAAwAAAAIAAQACgAAAAgAAAAIAAAAAAAAAAIAAAB8AAAABAAAAJ7///8UAAAARAAAAEQAAAAAAAoBRAAAAAEAAAAEAAAAjP///wgAAAAQAAAABAAAAGRhdGUAAAAADAAAAGVsYXN0aWM6dHlwZQAAAAAAAAAAgv///wAAAQAEAAAAZGF0ZQAAEgAYABQAEwASAAwAAAAIAAQAEgAAABQAAABMAAAAVAAAAAAAAwFUAAAAAQAAAAwAAAAIAAwACAAEAAgAAAAIAAAAEAAAAAYAAABkb3VibGUAAAwAAABlbGFzdGljOnR5cGUAAAAAAAAAAAAABgAIAAYABgAAAAAAAgAGAAAAYW1vdW50AAAAAAAA/////7gAAAAUAAAAAAAAAAwAFgAOABUAEAAEAAwAAABgAAAAAAAAAAAABAAQAAAAAAMKABgADAAIAAQACgAAABQAAABYAAAABQAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAQAAAAAAAAAIAAAAAAAAACgAAAAAAAAAMAAAAAAAAAABAAAAAAAAADgAAAAAAAAAKAAAAAAAAAAAAAAAAgAAAAUAAAAAAAAAAAAAAAAAAAAFAAAAAAAAAAAAAAAAAAAAHwAAAAAAAAAAAACgmZkTQAAAAGBmZiBAAAAAAAAAL0AAAADAzMwjQAAAAMDMzCtAHwAAAAAAAADV6yywkgEAANWPBquSAQAA1TPgpZIBAADV17mgkgEAANV7k5uSAQAA/////wAAAAA='
const MockConnection = connection.buildMockConnection({
onRequest (_params) {
return {
body: Buffer.from(binaryContent, 'base64'),
statusCode: 200,
headers: {
'content-type': 'application/vnd.elasticsearch+arrow+stream'
}
}
}
})
const client = new Client({
node: 'http://localhost:9200',
Connection: MockConnection
})
const result = await client.helpers.esql({ query: 'FROM sample_data' }).toArrowTable()
t.ok(result instanceof arrow.Table)
const table = [...result]
t.same(table[0], [
["amount", 4.900000095367432],
["date", 1729532586965],
])
t.end()
})
t.test('ESQL helper uses correct x-elastic-client-meta helper value', async t => {
const binaryContent = '/////zABAAAQAAAAAAAKAA4ABgANAAgACgAAAAAABAAQAAAAAAEKAAwAAAAIAAQACgAAAAgAAAAIAAAAAAAAAAIAAAB8AAAABAAAAJ7///8UAAAARAAAAEQAAAAAAAoBRAAAAAEAAAAEAAAAjP///wgAAAAQAAAABAAAAGRhdGUAAAAADAAAAGVsYXN0aWM6dHlwZQAAAAAAAAAAgv///wAAAQAEAAAAZGF0ZQAAEgAYABQAEwASAAwAAAAIAAQAEgAAABQAAABMAAAAVAAAAAAAAwFUAAAAAQAAAAwAAAAIAAwACAAEAAgAAAAIAAAAEAAAAAYAAABkb3VibGUAAAwAAABlbGFzdGljOnR5cGUAAAAAAAAAAAAABgAIAAYABgAAAAAAAgAGAAAAYW1vdW50AAAAAAAA/////7gAAAAUAAAAAAAAAAwAFgAOABUAEAAEAAwAAABgAAAAAAAAAAAABAAQAAAAAAMKABgADAAIAAQACgAAABQAAABYAAAABQAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAQAAAAAAAAAIAAAAAAAAACgAAAAAAAAAMAAAAAAAAAABAAAAAAAAADgAAAAAAAAAKAAAAAAAAAAAAAAAAgAAAAUAAAAAAAAAAAAAAAAAAAAFAAAAAAAAAAAAAAAAAAAAHwAAAAAAAAAAAACgmZkTQAAAAGBmZiBAAAAAAAAAL0AAAADAzMwjQAAAAMDMzCtAHwAAAAAAAADV6yywkgEAANWPBquSAQAA1TPgpZIBAADV17mgkgEAANV7k5uSAQAA/////wAAAAA='
const MockConnection = connection.buildMockConnection({
onRequest (params) {
const header = params.headers?.['x-elastic-client-meta'] ?? ''
t.ok(header.includes('h=qa'), `Client meta header does not include ESQL helper value: ${header}`)
return {
body: Buffer.from(binaryContent, 'base64'),
statusCode: 200,
headers: {
'content-type': 'application/vnd.elasticsearch+arrow+stream'
}
}
}
})
const client = new Client({
node: 'http://localhost:9200',
Connection: MockConnection
})
await client.helpers.esql({ query: 'FROM sample_data' }).toArrowTable()
t.end()
})
t.end()
})
test('toArrowReader', t => {
t.test('Parses a binary response into an Arrow stream reader', async t => {
const binaryContent = '/////zABAAAQAAAAAAAKAA4ABgANAAgACgAAAAAABAAQAAAAAAEKAAwAAAAIAAQACgAAAAgAAAAIAAAAAAAAAAIAAAB8AAAABAAAAJ7///8UAAAARAAAAEQAAAAAAAoBRAAAAAEAAAAEAAAAjP///wgAAAAQAAAABAAAAGRhdGUAAAAADAAAAGVsYXN0aWM6dHlwZQAAAAAAAAAAgv///wAAAQAEAAAAZGF0ZQAAEgAYABQAEwASAAwAAAAIAAQAEgAAABQAAABMAAAAVAAAAAAAAwFUAAAAAQAAAAwAAAAIAAwACAAEAAgAAAAIAAAAEAAAAAYAAABkb3VibGUAAAwAAABlbGFzdGljOnR5cGUAAAAAAAAAAAAABgAIAAYABgAAAAAAAgAGAAAAYW1vdW50AAAAAAAA/////7gAAAAUAAAAAAAAAAwAFgAOABUAEAAEAAwAAABgAAAAAAAAAAAABAAQAAAAAAMKABgADAAIAAQACgAAABQAAABYAAAABQAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAQAAAAAAAAAIAAAAAAAAACgAAAAAAAAAMAAAAAAAAAABAAAAAAAAADgAAAAAAAAAKAAAAAAAAAAAAAAAAgAAAAUAAAAAAAAAAAAAAAAAAAAFAAAAAAAAAAAAAAAAAAAAHwAAAAAAAAAAAACgmZkTQAAAAGBmZiBAAAAAAAAAL0AAAADAzMwjQAAAAMDMzCtAHwAAAAAAAADV6yywkgEAANWPBquSAQAA1TPgpZIBAADV17mgkgEAANV7k5uSAQAA/////wAAAAA='
const MockConnection = connection.buildMockConnection({
onRequest (_params) {
return {
body: Buffer.from(binaryContent, 'base64'),
statusCode: 200,
headers: {
'content-type': 'application/vnd.elasticsearch+arrow+stream'
}
}
}
})
const client = new Client({
node: 'http://localhost:9200',
Connection: MockConnection
})
const result = await client.helpers.esql({ query: 'FROM sample_data' }).toArrowReader()
t.ok(result.isStream())
const recordBatch = result.next().value
t.same(recordBatch.get(0)?.toJSON(), {
amount: 4.900000095367432,
date: 1729532586965,
})
t.end()
})
t.test('ESQL helper uses correct x-elastic-client-meta helper value', async t => {
const binaryContent = '/////zABAAAQAAAAAAAKAA4ABgANAAgACgAAAAAABAAQAAAAAAEKAAwAAAAIAAQACgAAAAgAAAAIAAAAAAAAAAIAAAB8AAAABAAAAJ7///8UAAAARAAAAEQAAAAAAAoBRAAAAAEAAAAEAAAAjP///wgAAAAQAAAABAAAAGRhdGUAAAAADAAAAGVsYXN0aWM6dHlwZQAAAAAAAAAAgv///wAAAQAEAAAAZGF0ZQAAEgAYABQAEwASAAwAAAAIAAQAEgAAABQAAABMAAAAVAAAAAAAAwFUAAAAAQAAAAwAAAAIAAwACAAEAAgAAAAIAAAAEAAAAAYAAABkb3VibGUAAAwAAABlbGFzdGljOnR5cGUAAAAAAAAAAAAABgAIAAYABgAAAAAAAgAGAAAAYW1vdW50AAAAAAAA/////7gAAAAUAAAAAAAAAAwAFgAOABUAEAAEAAwAAABgAAAAAAAAAAAABAAQAAAAAAMKABgADAAIAAQACgAAABQAAABYAAAABQAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAQAAAAAAAAAIAAAAAAAAACgAAAAAAAAAMAAAAAAAAAABAAAAAAAAADgAAAAAAAAAKAAAAAAAAAAAAAAAAgAAAAUAAAAAAAAAAAAAAAAAAAAFAAAAAAAAAAAAAAAAAAAAHwAAAAAAAAAAAACgmZkTQAAAAGBmZiBAAAAAAAAAL0AAAADAzMwjQAAAAMDMzCtAHwAAAAAAAADV6yywkgEAANWPBquSAQAA1TPgpZIBAADV17mgkgEAANV7k5uSAQAA/////wAAAAA='
const MockConnection = connection.buildMockConnection({
onRequest (params) {
const header = params.headers?.['x-elastic-client-meta'] ?? ''
t.ok(header.includes('h=qa'), `Client meta header does not include ESQL helper value: ${header}`)
return {
body: Buffer.from(binaryContent, 'base64'),
statusCode: 200,
headers: {
'content-type': 'application/vnd.elasticsearch+arrow+stream'
}
}
}
})
const client = new Client({
node: 'http://localhost:9200',
Connection: MockConnection
})
await client.helpers.esql({ query: 'FROM sample_data' }).toArrowReader()
t.end()
})
t.test('multi-batch support', async t => {
const intType = new arrow.Uint32
const floatType = new arrow.Float32
const schema = new arrow.Schema([
arrow.Field.new('id', intType),
arrow.Field.new('val', floatType)
])
function getBatch(ids: number[], vals: number[]) {
const id = arrow.makeData({ type: intType, data: ids })
const val = arrow.makeData({ type: floatType, data: vals })
return new arrow.RecordBatch({ id, val })
}
const batch1 = getBatch([1, 2, 3], [0.1, 0.2, 0.3])
const batch2 = getBatch([4, 5, 6], [0.4, 0.5, 0.6])
const batch3 = getBatch([7, 8, 9], [0.7, 0.8, 0.9])
const table = new arrow.Table(schema, [
new arrow.RecordBatch(schema, batch1.data),
new arrow.RecordBatch(schema, batch2.data),
new arrow.RecordBatch(schema, batch3.data),
])
const MockConnection = connection.buildMockConnection({
onRequest (_params) {
return {
body: Buffer.from(arrow.tableToIPC(table, "stream")),
statusCode: 200,
headers: {
'content-type': 'application/vnd.elasticsearch+arrow+stream'
}
}
}
})
const client = new Client({
node: 'http://localhost:9200',
Connection: MockConnection
})
const result = await client.helpers.esql({ query: 'FROM sample_data' }).toArrowReader()
t.ok(result.isStream())
let counter = 0
for (const batch of result) {
for (const row of batch) {
counter++
const { id, val } = row.toJSON()
t.equal(id, counter)
// floating points are hard in JS
t.equal((Math.round(val * 10) / 10).toFixed(1), (counter * 0.1).toFixed(1))
}
}
t.end()
})
t.end()
})
t.end()
})

View File

@@ -1,6 +1,6 @@
{
"compilerOptions": {
"target": "es2019",
"target": "ES2019",
"module": "commonjs",
"moduleResolution": "node",
"declaration": true,
@@ -21,7 +21,8 @@
"importHelpers": true,
"outDir": "lib",
"lib": [
"esnext"
"ES2019",
"dom"
]
},
"formatCodeOptions": {