-
Notifications
You must be signed in to change notification settings - Fork 2
perf: improve performance with part sizing #28
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,6 +23,7 @@ import { getStorageConfig } from '../auth/s3-client.js'; | |
| import { formatSize } from '../utils/format.js'; | ||
| import { get, put, list, head } from '@tigrisdata/storage'; | ||
| import { executeWithConcurrency } from '../utils/concurrency.js'; | ||
| import { calculateUploadParams } from '../utils/upload.js'; | ||
| import type { ParsedPath } from '../types.js'; | ||
|
|
||
| type CopyDirection = 'local-to-remote' | 'remote-to-local' | 'remote-to-remote'; | ||
|
|
@@ -106,15 +107,13 @@ async function uploadFile( | |
| return { error: `File not found: ${localPath}` }; | ||
| } | ||
|
|
||
| const fileStream = createReadStream(localPath); | ||
| const fileStream = createReadStream(localPath, { | ||
| highWaterMark: 1024 * 1024, | ||
| }); | ||
| const body = Readable.toWeb(fileStream) as ReadableStream; | ||
|
|
||
| const useMultipart = fileSize !== undefined && fileSize > 16 * 1024 * 1024; | ||
|
|
||
| const { error: putError } = await put(key, body, { | ||
| multipart: useMultipart, | ||
| partSize: useMultipart ? 16 * 1024 * 1024 : undefined, | ||
| queueSize: useMultipart ? 8 : undefined, | ||
| ...calculateUploadParams(fileSize), | ||
| onUploadProgress: showProgress | ||
| ? ({ loaded }) => { | ||
| if (fileSize !== undefined && fileSize > 0) { | ||
|
|
@@ -246,12 +245,8 @@ async function copyObject( | |
| return { error: getError.message }; | ||
| } | ||
|
|
||
| const useMultipart = fileSize !== undefined && fileSize > 16 * 1024 * 1024; | ||
|
|
||
| const { error: putError } = await put(destKey, data, { | ||
| multipart: useMultipart, | ||
| partSize: useMultipart ? 16 * 1024 * 1024 : undefined, | ||
| queueSize: useMultipart ? 8 : undefined, | ||
| ...calculateUploadParams(fileSize), | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Finding (medium severity): Remote-to-remote batch copies skip multipart upload sizing. Additional locations: 1. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reviewer question: There is a CopyObject API on the server side — are we using that for copying objects within the bucket?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wasn’t aware of it. Is it what we use for renaming in webconsole? Also, can it be used to move objects between buckets? |
||
| onUploadProgress: showProgress | ||
| ? ({ loaded }) => { | ||
| if (fileSize !== undefined && fileSize > 0) { | ||
|
|
@@ -749,7 +744,7 @@ export default async function cp(options: Record<string, unknown>) { | |
|
|
||
| const recursive = !!getOption<boolean>(options, ['recursive', 'r']); | ||
| const direction = detectDirection(src, dest); | ||
| const config = await getStorageConfig(); | ||
| const config = await getStorageConfig({ withCredentialProvider: true }); | ||
|
|
||
| switch (direction) { | ||
| case 'local-to-remote': { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| const MIN_PART_SIZE = 5 * 1024 * 1024; // 5 MB (S3 minimum) | ||
| const MAX_PARTS = 10_000; // S3 hard limit | ||
| const DEFAULT_QUEUE_SIZE = 10; // match AWS CLI | ||
|
|
||
| // Tiered part sizes to balance parallelism vs per-part overhead | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do you mean by per-part overhead? The parallelism controls how many parts get uploaded concurrently. |
||
| const ONE_GB = 1024 * 1024 * 1024; | ||
| const TEN_GB = 10 * ONE_GB; | ||
|
|
||
| function tieredPartSize(fileSize: number): number { | ||
| if (fileSize <= ONE_GB) return 5 * 1024 * 1024; // 5 MB — max parallelism | ||
| if (fileSize <= TEN_GB) return 16 * 1024 * 1024; // 16 MB — fewer parts, less overhead | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think there is no overhead in play when you use 5MB part size with 10GB files as well. The main thing is to make sure that whatever part you choose allows us to upload the file given the limit of 10K parts. So the calculation you need to do is define a minimum part size (such as 5MB), and then make sure we always have < 10K parts regardless of the file size. |
||
| return 32 * 1024 * 1024; // 32 MB — very large files | ||
| } | ||
|
|
||
| export function calculateUploadParams(fileSize?: number) { | ||
| if (!fileSize || fileSize <= MIN_PART_SIZE) { | ||
| return { multipart: false } as const; | ||
| } | ||
|
|
||
| let partSize = tieredPartSize(fileSize); | ||
|
|
||
| // Safety: ensure we don't exceed S3's 10,000-part limit | ||
| if (fileSize / partSize > MAX_PARTS) { | ||
| partSize = Math.ceil(fileSize / MAX_PARTS); | ||
| } | ||
|
|
||
| return { multipart: true, partSize, queueSize: DEFAULT_QUEUE_SIZE } as const; | ||
| } | ||


Uh oh!
There was an error while loading. Please reload this page.