Introduction of random access reading #1183

Merged (1 commit, Nov 26, 2024)
19 changes: 10 additions & 9 deletions lib/AbstractTokenizer.ts
@@ -14,17 +14,23 @@ interface INormalizedReadChunkOptions extends IReadChunkOptions {
*/
export abstract class AbstractTokenizer implements ITokenizer {

public fileInfo: IFileInfo;

private onClose?: OnClose;
private numBuffer = new Uint8Array(8);

public abstract fileInfo: IFileInfo;

/**
* Tokenizer-stream position
*/
public position = 0;


/**
* Constructor
* @param options Tokenizer options
* @protected
*/
protected constructor(options?: ITokenizerOptions) {
this.fileInfo = options?.fileInfo ?? {};
this.onClose = options?.onClose;
if (options?.abortSignal) {
options.abortSignal.addEventListener('abort', () => {
@@ -33,12 +39,7 @@ export abstract class AbstractTokenizer implements ITokenizer {
}
}

/**
* Tokenizer-stream position
*/
public position = 0;

private numBuffer = new Uint8Array(8);
abstract supportsRandomAccess(): boolean;

/**
* Read buffer from tokenizer
16 changes: 13 additions & 3 deletions lib/BufferTokenizer.ts
@@ -1,8 +1,10 @@
import type { ITokenizerOptions, IReadChunkOptions } from './types.js';
import type {ITokenizerOptions, IReadChunkOptions, IRandomAccessFileInfo, IRandomAccessTokenizer} from './types.js';
import { EndOfStreamError } from 'peek-readable';
import { AbstractTokenizer } from './AbstractTokenizer.js';

export class BufferTokenizer extends AbstractTokenizer {
export class BufferTokenizer extends AbstractTokenizer implements IRandomAccessTokenizer {

public fileInfo: IRandomAccessFileInfo;

/**
* Construct BufferTokenizer
@@ -11,7 +13,7 @@ export class BufferTokenizer extends AbstractTokenizer {
*/
constructor(private uint8Array: Uint8Array, options?: ITokenizerOptions) {
super(options);
this.fileInfo.size = this.fileInfo.size ? this.fileInfo.size : uint8Array.length;
this.fileInfo = {...options?.fileInfo ?? {}, ...{size: uint8Array.length}};
}

/**
@@ -55,4 +57,12 @@ export class BufferTokenizer extends AbstractTokenizer {
public close(): Promise<void> {
return super.close();
}

supportsRandomAccess(): boolean {
return true;
}

setPosition(position: number): void {
this.position = position;
}
}
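With this change a BufferTokenizer always derives its size from the underlying Uint8Array and implements the new IRandomAccessTokenizer contract. A minimal sketch of what that enables, assuming the published strtok3 entry point (top-level await assumes an ES module):

```ts
import { fromBuffer } from 'strtok3';

const tokenizer = fromBuffer(new Uint8Array([0x54, 0x41, 0x47, 0x00]));

console.log(tokenizer.fileInfo.size);          // 4, now always set from the Uint8Array
console.log(tokenizer.supportsRandomAccess()); // true

const out = new Uint8Array(3);
await tokenizer.readBuffer(out); // consumes bytes 0..2
tokenizer.setPosition(0);        // rewind via the new setter
await tokenizer.readBuffer(out); // reads bytes 0..2 again
```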
40 changes: 32 additions & 8 deletions lib/FileTokenizer.ts
@@ -1,12 +1,32 @@
import { AbstractTokenizer } from './AbstractTokenizer.js';
import { EndOfStreamError } from 'peek-readable';
import type { IReadChunkOptions, ITokenizerOptions } from './types.js';
import type {IRandomAccessTokenizer, IRandomAccessFileInfo, IReadChunkOptions, ITokenizerOptions} from './types.js';
import { type FileHandle, open as fsOpen } from 'node:fs/promises';

export class FileTokenizer extends AbstractTokenizer {
interface IFileTokenizerOptions extends ITokenizerOptions {
/**
* Pass additional file information to the tokenizer
*/
fileInfo: IRandomAccessFileInfo;
}

export class FileTokenizer extends AbstractTokenizer implements IRandomAccessTokenizer {

public constructor(private fileHandle: FileHandle, options: ITokenizerOptions) {
public fileInfo: IRandomAccessFileInfo;

/**
* Create tokenizer from provided file path
* @param sourceFilePath File path
*/
static async fromFile(sourceFilePath: string): Promise<FileTokenizer> {
const fileHandle = await fsOpen(sourceFilePath, 'r');
const stat = await fileHandle.stat();
return new FileTokenizer(fileHandle, {fileInfo: {path: sourceFilePath, size: stat.size}});
}

protected constructor(private fileHandle: FileHandle, options: IFileTokenizerOptions) {
super(options);
this.fileInfo = options.fileInfo;
}

/**
@@ -48,10 +68,14 @@ export class FileTokenizer extends AbstractTokenizer {
await this.fileHandle.close();
return super.close();
}
}

export async function fromFile(sourceFilePath: string): Promise<FileTokenizer> {
const fileHandle = await fsOpen(sourceFilePath, 'r');
const stat = await fileHandle.stat();
return new FileTokenizer(fileHandle, {fileInfo: {path: sourceFilePath, size: stat.size}});
setPosition(position: number): void {
this.position = position;
}

supportsRandomAccess(): boolean {
return true;
}
}
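The fromFile factory now lives on the class as a static method and the constructor is protected, so a FileTokenizer is always created with a known size. A hedged usage sketch (file name is hypothetical), mirroring the ID3v1 test added below:

```ts
import { fromFile } from 'strtok3';

const tokenizer = await fromFile('sample.mp3'); // hypothetical file
try {
  // Random access: read the 128-byte ID3v1 trailer without streaming the whole file
  const trailer = new Uint8Array(128);
  await tokenizer.readBuffer(trailer, { position: tokenizer.fileInfo.size - 128 });
  tokenizer.setPosition(0); // then jump back to the beginning
} finally {
  await tokenizer.close();
}
```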


9 changes: 8 additions & 1 deletion lib/ReadStreamTokenizer.ts
@@ -1,18 +1,21 @@
import { AbstractTokenizer } from './AbstractTokenizer.js';
import { EndOfStreamError, type IStreamReader } from 'peek-readable';
import type { IReadChunkOptions, ITokenizerOptions } from './types.js';
import type {IFileInfo, IReadChunkOptions, ITokenizerOptions} from './types.js';

const maxBufferSize = 256000;

export class ReadStreamTokenizer extends AbstractTokenizer {

public fileInfo: IFileInfo;

/**
* Constructor
* @param streamReader stream-reader to read from
* @param options Tokenizer options
*/
public constructor(private streamReader: IStreamReader, options?: ITokenizerOptions) {
super(options);
this.fileInfo = options?.fileInfo ?? {};
}

/**
@@ -102,4 +105,8 @@ export class ReadStreamTokenizer extends AbstractTokenizer {
public abort(): Promise<void> {
return this.streamReader.abort();
}

supportsRandomAccess(): boolean {
return false;
}
}
2 changes: 1 addition & 1 deletion lib/core.ts
@@ -6,7 +6,7 @@ import { BufferTokenizer } from './BufferTokenizer.js';
import type { ITokenizerOptions } from './types.js';

export { EndOfStreamError, type AnyWebByteStream } from 'peek-readable';
export type { ITokenizer, IFileInfo, ITokenizerOptions, IReadChunkOptions, OnClose } from './types.js';
export type { ITokenizer, IRandomAccessTokenizer, IFileInfo, ITokenizerOptions, IReadChunkOptions, OnClose } from './types.js';
export type { IToken, IGetToken } from '@tokenizer/token';
export { AbstractTokenizer } from './AbstractTokenizer.js';

14 changes: 8 additions & 6 deletions lib/index.ts
@@ -2,8 +2,9 @@ import type { Readable } from 'node:stream';
import type { ReadStreamTokenizer } from './ReadStreamTokenizer.js';
import { stat as fsStat } from 'node:fs/promises';
import { type ITokenizerOptions, fromStream as coreFromStream } from './core.js';
import {FileTokenizer} from "./FileTokenizer.js";

export { fromFile } from './FileTokenizer.js';
export { FileTokenizer } from './FileTokenizer.js';
export * from './core.js';
export type { IToken, IGetToken } from '@tokenizer/token';

@@ -22,12 +23,13 @@ interface StreamWithFile extends Readable {
* @returns Tokenizer
*/
export async function fromStream(stream: Readable, options?: ITokenizerOptions): Promise<ReadStreamTokenizer> {
const augmentedOptions: ITokenizerOptions = options ?? {};
augmentedOptions.fileInfo = augmentedOptions.fileInfo ?? {};
const rst = coreFromStream(stream, options);
if ((stream as StreamWithFile).path) {
const stat = await fsStat((stream as StreamWithFile).path as string);
augmentedOptions.fileInfo.path = (stream as StreamWithFile).path;
augmentedOptions.fileInfo.size = stat.size;
rst.fileInfo.path = (stream as StreamWithFile).path;
rst.fileInfo.size = stat.size;
}
return coreFromStream(stream, augmentedOptions);
return rst;
}

export const fromFile = FileTokenizer.fromFile;
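The reworked fromStream sets path and size on the returned tokenizer's fileInfo instead of mutating the caller's options object. Note that knowing the size does not make the tokenizer seekable; a sketch under the same package-name assumption, with a hypothetical file name:

```ts
import { createReadStream } from 'node:fs';
import { fromStream } from 'strtok3';

const tokenizer = await fromStream(createReadStream('sample.mp3'));

// File-backed streams still expose path and size on fileInfo...
console.log(tokenizer.fileInfo.path, tokenizer.fileInfo.size);
// ...but reading remains strictly forward-only:
console.log(tokenizer.supportsRandomAccess()); // false
```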
34 changes: 30 additions & 4 deletions lib/types.ts
@@ -21,6 +21,13 @@ export interface IFileInfo {
url?: string;
}

export interface IRandomAccessFileInfo extends IFileInfo {
/**
* File size in bytes
*/
size: number;
}

export interface IReadChunkOptions {

/**
@@ -36,16 +43,30 @@
/**
* Position where to begin reading from the file.
* Default it is `tokenizer.position`.
* Position may not be less then `tokenizer.position`.
* Position may not be less than `tokenizer.position`, unless `supportsRandomAccess()` returns `true`.
*/
position?: number;

/**
* If set, will not throw an EOF error if not all of the requested data could be read
*/
mayBeLess?: boolean;
}

export interface IRandomAccessTokenizer extends ITokenizer {

/**
* Provide access to information about the underlying stream or file.
*/
fileInfo: IRandomAccessFileInfo;

/**
* Change the position (offset) of the tokenizer
* @param position New position
*/
setPosition(position: number): void;
}

/**
* The tokenizer allows us to read or peek from the tokenizer-stream.
* The tokenizer-stream is an abstraction of a stream, file or Buffer.
@@ -55,12 +76,12 @@ export interface ITokenizer {
/**
* Provide access to information about the underlying stream or file.
*/
fileInfo: IFileInfo;
readonly fileInfo: IFileInfo;

/**
* Offset in bytes (= number of bytes read) since beginning of file or stream
*/
position: number;
readonly position: number;

/**
* Peek (read ahead) buffer from tokenizer
@@ -123,6 +144,11 @@ export interface ITokenizer {
* Abort pending asynchronous operations
*/
abort(): Promise<void>;

/**
* Returns true when the underlying file supports random access
*/
supportsRandomAccess(): boolean;
}

export type OnClose = () => Promise<void>;
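Because supportsRandomAccess() is declared on ITokenizer itself, callers can branch at runtime and narrow to IRandomAccessTokenizer. One possible user-land type guard built on the new API (a sketch, not part of this PR):

```ts
import type { ITokenizer, IRandomAccessTokenizer } from 'strtok3';

// Hypothetical helper: narrows ITokenizer to the random-access variant
function isRandomAccess(tokenizer: ITokenizer): tokenizer is IRandomAccessTokenizer {
  return tokenizer.supportsRandomAccess();
}

function rewind(tokenizer: ITokenizer): void {
  if (isRandomAccess(tokenizer)) {
    tokenizer.setPosition(0); // safe: buffer- or file-backed
  } else {
    throw new Error('Underlying source is forward-only');
  }
}
```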
Binary file added test/resources/id3v1.mp3
25 changes: 23 additions & 2 deletions test/test.ts
@@ -9,7 +9,6 @@ import { assert, expect, use } from 'chai';
import chaiAsPromised from 'chai-as-promised';
import { fromStream, fromWebStream, fromFile, fromBuffer, type ITokenizer } from '../lib/index.js';
import Path from 'node:path';
import { FileTokenizer } from '../lib/FileTokenizer.js';
import { EndOfStreamError } from 'peek-readable';

import mocha from 'mocha';
@@ -617,7 +616,7 @@ describe('Matrix tests', () => {

const rst = await tokenizerType.loadTokenizer('test1.dat');

if (rst instanceof FileTokenizer) {
if (rst.supportsRandomAccess()) {
assert.strictEqual(rst.fileInfo.size, 16, 'check file size property');
}
await peekOnData(rst);
@@ -993,5 +992,27 @@ it('should release stream after close', async () => {
assert.isFalse(stream.locked, 'stream is unlocked after closing tokenizer');
});

describe('Random-read-access', () => {

it('Read ID3v1 header at the end of the file', async () => {

const tokenizer = await fromFile(getResourcePath('id3v1.mp3'));
try {
const id3HeaderSize = 128;
const id3Header = new Uint8Array(id3HeaderSize);
await tokenizer.readBuffer(id3Header,{position: tokenizer.fileInfo.size - id3HeaderSize});
const id3Tag = new TextDecoder('utf-8').decode(id3Header.subarray(0, 3));
assert.strictEqual(id3Tag, 'TAG');
assert.strictEqual(tokenizer.position, tokenizer.fileInfo.size, 'Tokenizer position should be at the end of the file');
tokenizer.setPosition(0);
assert.strictEqual(tokenizer.position, 0, 'Tokenizer position should be at the beginning of the file');
}
finally {
await tokenizer.close();
}
});

});


