Add streaming support to Arrow helper (#2407)

This commit is contained in:
Josh Mock
2024-11-04 15:47:53 -06:00
committed by GitHub
parent e0c613f898
commit 11939fd22c
7 changed files with 328 additions and 102 deletions

View File

@ -13,6 +13,11 @@
You can find all the API changes You can find all the API changes
https://www.elastic.co/guide/en/elasticsearch/reference/8.16/release-notes-8.16.0.html[here]. https://www.elastic.co/guide/en/elasticsearch/reference/8.16/release-notes-8.16.0.html[here].
[discrete]
===== Support Apache Arrow in ES|QL helper
The ES|QL helper can now return results as an Apache Arrow `Table` or `RecordBatchReader`, which enables high-performance calculations on ES|QL results, even if the response data is larger than the system's available memory. See <<esql-helper>> for more information.
[discrete] [discrete]
==== Fixes ==== Fixes

View File

@ -349,7 +349,7 @@ In this case, the result will be:
body: object | boolean body: object | boolean
statusCode: number statusCode: number
headers: object headers: object
warnings: [string], warnings: string[],
meta: object meta: object
} }
---- ----
@ -410,19 +410,23 @@ The supported request specific options are:
[cols=2*] [cols=2*]
|=== |===
|`ignore` |`ignore`
|`[number]` - HTTP status codes which should not be considered errors for this request. + |`number[]` - HTTP status codes which should not be considered errors for this request. +
_Default:_ `null` _Default:_ `null`
|`requestTimeout` |`requestTimeout`
|`number` - Max request timeout for the request in milliseconds, it overrides the client default. + |`number | string` - Max request timeout for the request in milliseconds, it overrides the client default. +
_Default:_ `30000` _Default:_ `30000`
|`retryOnTimeout`
|`boolean` - Retry requests that have timed out. +
_Default:_ `false`
|`maxRetries` |`maxRetries`
|`number` - Max number of retries for the request, it overrides the client default. + |`number` - Max number of retries for the request, it overrides the client default. +
_Default:_ `3` _Default:_ `3`
|`compression` |`compression`
|`string, boolean` - Enables body compression for the request. + |`string | boolean` - Enables body compression for the request. +
_Options:_ `false`, `'gzip'` + _Options:_ `false`, `'gzip'` +
_Default:_ `false` _Default:_ `false`
@ -446,6 +450,10 @@ _Default:_ `null`
|`any` - Custom object per request. _(you can use it to pass data to the clients events)_ + |`any` - Custom object per request. _(you can use it to pass data to the clients events)_ +
_Default:_ `null` _Default:_ `null`
|`opaqueId`
|`string` - Set the `X-Opaque-Id` HTTP header. See {ref}/api-conventions.html#x-opaque-id +
_Default:_ `null`
|`maxResponseSize` |`maxResponseSize`
|`number` - When configured, it verifies that the uncompressed response size is lower than the configured number, if it's higher it will abort the request. It cannot be higher than buffer.constants.MAX_STRING_LENGTH |`number` - When configured, it verifies that the uncompressed response size is lower than the configured number, if it's higher it will abort the request. It cannot be higher than buffer.constants.MAX_STRING_LENGTH +
_Default:_ `null` _Default:_ `null`
@ -458,6 +466,17 @@ _Default:_ `null`
|`AbortSignal` - The AbortSignal instance to allow request abortion. + |`AbortSignal` - The AbortSignal instance to allow request abortion. +
_Default:_ `null` _Default:_ `null`
|`meta`
|`boolean` - Rather than returning the body, return an object containing `body`, `statusCode`, `headers` and `meta` keys +
_Default_: `false`
|`redaction`
|`object` - Options for redacting potentially sensitive data from error metadata. See <<redaction>>.
|`retryBackoff`
|`(min: number, max: number, attempt: number) => number;` - A function that calculates how long to sleep, in seconds, before the next request retry +
_Default:_ A built-in function that uses exponential backoff with jitter.
|=== |===
[discrete] [discrete]

View File

@ -707,3 +707,42 @@ const result = await client.helpers
.esql({ query: 'FROM sample_data | LIMIT 2' }) .esql({ query: 'FROM sample_data | LIMIT 2' })
.toRecords<EventLog>() .toRecords<EventLog>()
---- ----
[discrete]
===== `toArrowReader`
~Added~ ~in~ ~`v8.16.0`~
ES|QL can return results in multiple binary formats, including https://arrow.apache.org/[Apache Arrow]'s streaming format. Because it is a very efficient format to read, it can be valuable for performing high-performance in-memory analytics. And, because the response is streamed as batches of records, it can be used to produce aggregations and other calculations on larger-than-memory data sets.
`toArrowReader` returns a https://arrow.apache.org/docs/js/classes/Arrow_dom.RecordBatchReader.html[`RecordBatchStreamReader`].
[source,ts]
----
const reader = await client.helpers
.esql({ query: 'FROM sample_data' })
.toArrowReader()
// print each record as JSON
for (const recordBatch of reader) {
for (const record of recordBatch) {
console.log(record.toJSON())
}
}
----
[discrete]
===== `toArrowTable`
~Added~ ~in~ ~`v8.16.0`~
If you would like to pull the entire data set in Arrow format but without streaming, you can use the `toArrowTable` helper to get a https://arrow.apache.org/docs/js/classes/Arrow_dom.Table.html[Table] back instead.
[source,ts]
----
const table = await client.helpers
.esql({ query: 'FROM sample_data' })
.toArrowTable()
console.log(table.toArray())
----

View File

@ -35,8 +35,39 @@ class MyTransport extends Transport {
==== Supported content types ==== Supported content types
- `application/json`, in this case the transport will return a plain JavaScript object Depending on the `content-type` of the response, the transport will return the body as different types:
- `text/plain`, in this case the transport will return a plain string
- `application/vnd.mapbox-vector-tile`, in this case the transport will return a Buffer
- `application/vnd.elasticsearch+json`, in this case the transport will return a plain JavaScript object
[cols="1,1"]
|===
|Content-Type |JavaScript type
|`application/json`
|`object`
|`text/plain`
|`string`
|`application/vnd.elasticsearch+json`
|`object`
|`application/vnd.mapbox-vector-tile`
|`Buffer`
|`application/vnd.apache.arrow.stream`
|`Buffer`
|`application/vnd.elasticsearch+arrow+stream`
|`Buffer`
|`application/smile`
|`Buffer`
|`application/vnd.elasticsearch+smile`
|`Buffer`
|`application/cbor`
|`Buffer`
|`application/vnd.elasticsearch+cbor`
|`Buffer`
|===

View File

@ -87,8 +87,8 @@
"zx": "7.2.3" "zx": "7.2.3"
}, },
"dependencies": { "dependencies": {
"@elastic/transport": "^8.9.0", "@elastic/transport": "^8.9.1",
"@apache-arrow/esnext-cjs": "^17.0.0", "apache-arrow": "^18.0.0",
"tslib": "^2.4.0" "tslib": "^2.4.0"
}, },
"tap": { "tap": {

View File

@ -25,7 +25,7 @@ import assert from 'node:assert'
import * as timersPromises from 'node:timers/promises' import * as timersPromises from 'node:timers/promises'
import { Readable } from 'node:stream' import { Readable } from 'node:stream'
import { errors, TransportResult, TransportRequestOptions, TransportRequestOptionsWithMeta } from '@elastic/transport' import { errors, TransportResult, TransportRequestOptions, TransportRequestOptionsWithMeta } from '@elastic/transport'
import { Table, TypeMap, tableFromIPC } from '@apache-arrow/esnext-cjs' import { Table, TypeMap, tableFromIPC, RecordBatchStreamReader } from 'apache-arrow/Arrow.node'
import Client from './client' import Client from './client'
import * as T from './api/types' import * as T from './api/types'
@ -156,7 +156,8 @@ export interface EsqlResponse {
export interface EsqlHelper { export interface EsqlHelper {
toRecords: <TDocument>() => Promise<EsqlToRecords<TDocument>> toRecords: <TDocument>() => Promise<EsqlToRecords<TDocument>>
toArrow: () => Promise<Table<TypeMap>> toArrowTable: () => Promise<Table<TypeMap>>
toArrowReader: () => Promise<RecordBatchStreamReader>
} }
export interface EsqlToRecords<TDocument> { export interface EsqlToRecords<TDocument> {
@ -1003,7 +1004,7 @@ export default class Helpers {
return { records, columns } return { records, columns }
}, },
async toArrow (): Promise<Table<TypeMap>> { async toArrowTable (): Promise<Table<TypeMap>> {
if (metaHeader !== null) { if (metaHeader !== null) {
reqOptions.headers = reqOptions.headers ?? {} reqOptions.headers = reqOptions.headers ?? {}
reqOptions.headers['x-elastic-client-meta'] = `${metaHeader as string},h=qa` reqOptions.headers['x-elastic-client-meta'] = `${metaHeader as string},h=qa`
@ -1013,6 +1014,19 @@ export default class Helpers {
const response = await client.esql.query(params, reqOptions) const response = await client.esql.query(params, reqOptions)
return tableFromIPC(response) return tableFromIPC(response)
},
async toArrowReader (): Promise<RecordBatchStreamReader> {
if (metaHeader !== null) {
reqOptions.headers = reqOptions.headers ?? {}
reqOptions.headers['x-elastic-client-meta'] = `${metaHeader as string},h=qa`
reqOptions.asStream = true
}
params.format = 'arrow'
const response = await client.esql.query(params, reqOptions)
return RecordBatchStreamReader.from(response)
} }
} }

View File

@ -18,7 +18,7 @@
*/ */
import { test } from 'tap' import { test } from 'tap'
import { Table } from '@apache-arrow/esnext-cjs' import * as arrow from 'apache-arrow'
import { connection } from '../../utils' import { connection } from '../../utils'
import { Client } from '../../../' import { Client } from '../../../'
@ -111,7 +111,7 @@ test('ES|QL helper', t => {
t.end() t.end()
}) })
test('toArrow', t => { test('toArrowTable', t => {
t.test('Parses a binary response into an Arrow table', async t => { t.test('Parses a binary response into an Arrow table', async t => {
const binaryContent = '/////zABAAAQAAAAAAAKAA4ABgANAAgACgAAAAAABAAQAAAAAAEKAAwAAAAIAAQACgAAAAgAAAAIAAAAAAAAAAIAAAB8AAAABAAAAJ7///8UAAAARAAAAEQAAAAAAAoBRAAAAAEAAAAEAAAAjP///wgAAAAQAAAABAAAAGRhdGUAAAAADAAAAGVsYXN0aWM6dHlwZQAAAAAAAAAAgv///wAAAQAEAAAAZGF0ZQAAEgAYABQAEwASAAwAAAAIAAQAEgAAABQAAABMAAAAVAAAAAAAAwFUAAAAAQAAAAwAAAAIAAwACAAEAAgAAAAIAAAAEAAAAAYAAABkb3VibGUAAAwAAABlbGFzdGljOnR5cGUAAAAAAAAAAAAABgAIAAYABgAAAAAAAgAGAAAAYW1vdW50AAAAAAAA/////7gAAAAUAAAAAAAAAAwAFgAOABUAEAAEAAwAAABgAAAAAAAAAAAABAAQAAAAAAMKABgADAAIAAQACgAAABQAAABYAAAABQAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAQAAAAAAAAAIAAAAAAAAACgAAAAAAAAAMAAAAAAAAAABAAAAAAAAADgAAAAAAAAAKAAAAAAAAAAAAAAAAgAAAAUAAAAAAAAAAAAAAAAAAAAFAAAAAAAAAAAAAAAAAAAAHwAAAAAAAAAAAACgmZkTQAAAAGBmZiBAAAAAAAAAL0AAAADAzMwjQAAAAMDMzCtAHwAAAAAAAADV6yywkgEAANWPBquSAQAA1TPgpZIBAADV17mgkgEAANV7k5uSAQAA/////wAAAAA=' const binaryContent = '/////zABAAAQAAAAAAAKAA4ABgANAAgACgAAAAAABAAQAAAAAAEKAAwAAAAIAAQACgAAAAgAAAAIAAAAAAAAAAIAAAB8AAAABAAAAJ7///8UAAAARAAAAEQAAAAAAAoBRAAAAAEAAAAEAAAAjP///wgAAAAQAAAABAAAAGRhdGUAAAAADAAAAGVsYXN0aWM6dHlwZQAAAAAAAAAAgv///wAAAQAEAAAAZGF0ZQAAEgAYABQAEwASAAwAAAAIAAQAEgAAABQAAABMAAAAVAAAAAAAAwFUAAAAAQAAAAwAAAAIAAwACAAEAAgAAAAIAAAAEAAAAAYAAABkb3VibGUAAAwAAABlbGFzdGljOnR5cGUAAAAAAAAAAAAABgAIAAYABgAAAAAAAgAGAAAAYW1vdW50AAAAAAAA/////7gAAAAUAAAAAAAAAAwAFgAOABUAEAAEAAwAAABgAAAAAAAAAAAABAAQAAAAAAMKABgADAAIAAQACgAAABQAAABYAAAABQAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAQAAAAAAAAAIAAAAAAAAACgAAAAAAAAAMAAAAAAAAAABAAAAAAAAADgAAAAAAAAAKAAAAAAAAAAAAAAAAgAAAAUAAAAAAAAAAAAAAAAAAAAFAAAAAAAAAAAAAAAAAAAAHwAAAAAAAAAAAACgmZkTQAAAAGBmZiBAAAAAAAAAL0AAAADAzMwjQAAAAMDMzCtAHwAAAAAAAADV6yywkgEAANWPBquSAQAA1TPgpZIBAADV17mgkgEAANV7k5uSAQAA/////wAAAAA='
@ -132,8 +132,8 @@ test('ES|QL helper', t => {
Connection: MockConnection Connection: MockConnection
}) })
const result = await client.helpers.esql({ query: 'FROM sample_data' }).toArrow() const result = await client.helpers.esql({ query: 'FROM sample_data' }).toArrowTable()
t.ok(result instanceof Table) t.ok(result instanceof arrow.Table)
const table = [...result] const table = [...result]
t.same(table[0], [ t.same(table[0], [
@ -165,7 +165,125 @@ test('ES|QL helper', t => {
Connection: MockConnection Connection: MockConnection
}) })
await client.helpers.esql({ query: 'FROM sample_data' }).toArrow() await client.helpers.esql({ query: 'FROM sample_data' }).toArrowTable()
t.end()
})
t.end()
})
test('toArrowReader', t => {
t.test('Parses a binary response into an Arrow stream reader', async t => {
const binaryContent = '/////zABAAAQAAAAAAAKAA4ABgANAAgACgAAAAAABAAQAAAAAAEKAAwAAAAIAAQACgAAAAgAAAAIAAAAAAAAAAIAAAB8AAAABAAAAJ7///8UAAAARAAAAEQAAAAAAAoBRAAAAAEAAAAEAAAAjP///wgAAAAQAAAABAAAAGRhdGUAAAAADAAAAGVsYXN0aWM6dHlwZQAAAAAAAAAAgv///wAAAQAEAAAAZGF0ZQAAEgAYABQAEwASAAwAAAAIAAQAEgAAABQAAABMAAAAVAAAAAAAAwFUAAAAAQAAAAwAAAAIAAwACAAEAAgAAAAIAAAAEAAAAAYAAABkb3VibGUAAAwAAABlbGFzdGljOnR5cGUAAAAAAAAAAAAABgAIAAYABgAAAAAAAgAGAAAAYW1vdW50AAAAAAAA/////7gAAAAUAAAAAAAAAAwAFgAOABUAEAAEAAwAAABgAAAAAAAAAAAABAAQAAAAAAMKABgADAAIAAQACgAAABQAAABYAAAABQAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAQAAAAAAAAAIAAAAAAAAACgAAAAAAAAAMAAAAAAAAAABAAAAAAAAADgAAAAAAAAAKAAAAAAAAAAAAAAAAgAAAAUAAAAAAAAAAAAAAAAAAAAFAAAAAAAAAAAAAAAAAAAAHwAAAAAAAAAAAACgmZkTQAAAAGBmZiBAAAAAAAAAL0AAAADAzMwjQAAAAMDMzCtAHwAAAAAAAADV6yywkgEAANWPBquSAQAA1TPgpZIBAADV17mgkgEAANV7k5uSAQAA/////wAAAAA='
const MockConnection = connection.buildMockConnection({
onRequest (_params) {
return {
body: Buffer.from(binaryContent, 'base64'),
statusCode: 200,
headers: {
'content-type': 'application/vnd.elasticsearch+arrow+stream'
}
}
}
})
const client = new Client({
node: 'http://localhost:9200',
Connection: MockConnection
})
const result = await client.helpers.esql({ query: 'FROM sample_data' }).toArrowReader()
t.ok(result.isStream())
const recordBatch = result.next().value
t.same(recordBatch.get(0)?.toJSON(), {
amount: 4.900000095367432,
date: 1729532586965,
})
t.end()
})
t.test('ESQL helper uses correct x-elastic-client-meta helper value', async t => {
const binaryContent = '/////zABAAAQAAAAAAAKAA4ABgANAAgACgAAAAAABAAQAAAAAAEKAAwAAAAIAAQACgAAAAgAAAAIAAAAAAAAAAIAAAB8AAAABAAAAJ7///8UAAAARAAAAEQAAAAAAAoBRAAAAAEAAAAEAAAAjP///wgAAAAQAAAABAAAAGRhdGUAAAAADAAAAGVsYXN0aWM6dHlwZQAAAAAAAAAAgv///wAAAQAEAAAAZGF0ZQAAEgAYABQAEwASAAwAAAAIAAQAEgAAABQAAABMAAAAVAAAAAAAAwFUAAAAAQAAAAwAAAAIAAwACAAEAAgAAAAIAAAAEAAAAAYAAABkb3VibGUAAAwAAABlbGFzdGljOnR5cGUAAAAAAAAAAAAABgAIAAYABgAAAAAAAgAGAAAAYW1vdW50AAAAAAAA/////7gAAAAUAAAAAAAAAAwAFgAOABUAEAAEAAwAAABgAAAAAAAAAAAABAAQAAAAAAMKABgADAAIAAQACgAAABQAAABYAAAABQAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAQAAAAAAAAAIAAAAAAAAACgAAAAAAAAAMAAAAAAAAAABAAAAAAAAADgAAAAAAAAAKAAAAAAAAAAAAAAAAgAAAAUAAAAAAAAAAAAAAAAAAAAFAAAAAAAAAAAAAAAAAAAAHwAAAAAAAAAAAACgmZkTQAAAAGBmZiBAAAAAAAAAL0AAAADAzMwjQAAAAMDMzCtAHwAAAAAAAADV6yywkgEAANWPBquSAQAA1TPgpZIBAADV17mgkgEAANV7k5uSAQAA/////wAAAAA='
const MockConnection = connection.buildMockConnection({
onRequest (params) {
const header = params.headers?.['x-elastic-client-meta'] ?? ''
t.ok(header.includes('h=qa'), `Client meta header does not include ESQL helper value: ${header}`)
return {
body: Buffer.from(binaryContent, 'base64'),
statusCode: 200,
headers: {
'content-type': 'application/vnd.elasticsearch+arrow+stream'
}
}
}
})
const client = new Client({
node: 'http://localhost:9200',
Connection: MockConnection
})
await client.helpers.esql({ query: 'FROM sample_data' }).toArrowReader()
t.end()
})
t.test('multi-batch support', async t => {
const intType = new arrow.Uint32
const floatType = new arrow.Float32
const schema = new arrow.Schema([
arrow.Field.new('id', intType),
arrow.Field.new('val', floatType)
])
function getBatch(ids: number[], vals: number[]) {
const id = arrow.makeData({ type: intType, data: ids })
const val = arrow.makeData({ type: floatType, data: vals })
return new arrow.RecordBatch({ id, val })
}
const batch1 = getBatch([1, 2, 3], [0.1, 0.2, 0.3])
const batch2 = getBatch([4, 5, 6], [0.4, 0.5, 0.6])
const batch3 = getBatch([7, 8, 9], [0.7, 0.8, 0.9])
const table = new arrow.Table(schema, [
new arrow.RecordBatch(schema, batch1.data),
new arrow.RecordBatch(schema, batch2.data),
new arrow.RecordBatch(schema, batch3.data),
])
const MockConnection = connection.buildMockConnection({
onRequest (_params) {
return {
body: Buffer.from(arrow.tableToIPC(table, "stream")),
statusCode: 200,
headers: {
'content-type': 'application/vnd.elasticsearch+arrow+stream'
}
}
}
})
const client = new Client({
node: 'http://localhost:9200',
Connection: MockConnection
})
const result = await client.helpers.esql({ query: 'FROM sample_data' }).toArrowReader()
t.ok(result.isStream())
let counter = 0
for (const batch of result) {
for (const row of batch) {
counter++
const { id, val } = row.toJSON()
t.equal(id, counter)
// floating points are hard in JS
t.equal((Math.round(val * 10) / 10).toFixed(1), (counter * 0.1).toFixed(1))
}
}
t.end() t.end()
}) })