Add extension for tokenizer for random access to media
Borewit committed Nov 26, 2024
1 parent 8b4f02b commit 15267d8
Showing 9 changed files with 125 additions and 34 deletions.
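In short: every tokenizer now reports supportsRandomAccess(), and random-access tokenizers (file- and buffer-backed) additionally expose setPosition() and a guaranteed fileInfo.size. A minimal TypeScript sketch of the new surface, modelled on the test added at the bottom of this commit (the published package name strtok3 and the local file name are assumptions):

import { fromFile } from 'strtok3';

const tokenizer = await fromFile('test/resources/id3v1.mp3');
try {
  // A file tokenizer supports random access, so an absolute position may be
  // requested, even one behind data that has already been read.
  const id3Header = new Uint8Array(128);
  await tokenizer.readBuffer(id3Header, {position: tokenizer.fileInfo.size - 128});
  tokenizer.setPosition(0); // rewind to the start of the file
} finally {
  await tokenizer.close();
}
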
19 changes: 10 additions & 9 deletions lib/AbstractTokenizer.ts
@@ -14,17 +14,23 @@ interface INormalizedReadChunkOptions extends IReadChunkOptions {
*/
export abstract class AbstractTokenizer implements ITokenizer {

public fileInfo: IFileInfo;

private onClose?: OnClose;
private numBuffer = new Uint8Array(8);

public abstract fileInfo: IFileInfo;

/**
* Tokenizer-stream position
*/
public position = 0;


/**
* Constructor
* @param options Tokenizer options
* @protected
*/
protected constructor(options?: ITokenizerOptions) {
this.fileInfo = options?.fileInfo ?? {};
this.onClose = options?.onClose;
if (options?.abortSignal) {
options.abortSignal.addEventListener('abort', () => {
@@ -33,12 +39,7 @@ export abstract class AbstractTokenizer implements ITokenizer {
}
}

/**
* Tokenizer-stream position
*/
public position = 0;

private numBuffer = new Uint8Array(8);
abstract supportsRandomAccess(): boolean;

/**
* Read buffer from tokenizer
16 changes: 13 additions & 3 deletions lib/BufferTokenizer.ts
@@ -1,8 +1,10 @@
import type { ITokenizerOptions, IReadChunkOptions } from './types.js';
import type {ITokenizerOptions, IReadChunkOptions, IRandomAccessFileInfo, IRandomAccessTokenizer} from './types.js';
import { EndOfStreamError } from 'peek-readable';
import { AbstractTokenizer } from './AbstractTokenizer.js';

export class BufferTokenizer extends AbstractTokenizer {
export class BufferTokenizer extends AbstractTokenizer implements IRandomAccessTokenizer {

public fileInfo: IRandomAccessFileInfo;

/**
* Construct BufferTokenizer
@@ -11,7 +13,7 @@ export class BufferTokenizer extends AbstractTokenizer {
*/
constructor(private uint8Array: Uint8Array, options?: ITokenizerOptions) {
super(options);
this.fileInfo.size = this.fileInfo.size ? this.fileInfo.size : uint8Array.length;
this.fileInfo = {...options?.fileInfo ?? {}, ...{size: uint8Array.length}};
}

/**
@@ -55,4 +57,12 @@ export class BufferTokenizer extends AbstractTokenizer {
public close(): Promise<void> {
return super.close();
}

supportsRandomAccess(): boolean {
return true;
}

setPosition(position: number): void {
this.position = position;
}
}
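Note the constructor change: fileInfo.size is now always derived from the buffer itself, overriding any caller-supplied size. Since the whole input is in memory, a BufferTokenizer is always randomly accessible; a quick sketch (package name strtok3 assumed):

import { fromBuffer } from 'strtok3';

const tokenizer = fromBuffer(new TextEncoder().encode('TAGdemo'));
console.log(tokenizer.supportsRandomAccess()); // true: the data is fully in memory
tokenizer.setPosition(0); // rewinding behind the current position is allowed
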
40 changes: 32 additions & 8 deletions lib/FileTokenizer.ts
@@ -1,12 +1,32 @@
import { AbstractTokenizer } from './AbstractTokenizer.js';
import { EndOfStreamError } from 'peek-readable';
import type { IReadChunkOptions, ITokenizerOptions } from './types.js';
import type {IRandomAccessTokenizer, IRandomAccessFileInfo, IReadChunkOptions, ITokenizerOptions} from './types.js';
import { type FileHandle, open as fsOpen } from 'node:fs/promises';

export class FileTokenizer extends AbstractTokenizer {
interface IFileTokenizerOptions extends ITokenizerOptions {
/**
* Pass additional file information to the tokenizer
*/
fileInfo: IRandomAccessFileInfo;
}

export class FileTokenizer extends AbstractTokenizer implements IRandomAccessTokenizer {

public constructor(private fileHandle: FileHandle, options: ITokenizerOptions) {
public fileInfo: IRandomAccessFileInfo;

/**
* Create tokenizer from provided file path
* @param sourceFilePath File path
*/
static async fromFile(sourceFilePath: string): Promise<FileTokenizer> {
const fileHandle = await fsOpen(sourceFilePath, 'r');
const stat = await fileHandle.stat();
return new FileTokenizer(fileHandle, {fileInfo: {path: sourceFilePath, size: stat.size}});
}

protected constructor(private fileHandle: FileHandle, options: IFileTokenizerOptions) {
super(options);
this.fileInfo = options.fileInfo;
}

/**
@@ -48,10 +68,14 @@ export class FileTokenizer extends AbstractTokenizer {
await this.fileHandle.close();
return super.close();
}
}

export async function fromFile(sourceFilePath: string): Promise<FileTokenizer> {
const fileHandle = await fsOpen(sourceFilePath, 'r');
const stat = await fileHandle.stat();
return new FileTokenizer(fileHandle, {fileInfo: {path: sourceFilePath, size: stat.size}});
setPosition(position: number): void {
this.position = position;
}

supportsRandomAccess(): boolean {
return true;
}
}
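With fromFile() moved onto the class as a static factory and the constructor now protected, a FileTokenizer is obtained like this (the file path is hypothetical; the module-level fromFile survives as an alias, see lib/index.ts below):

import { FileTokenizer } from 'strtok3';

const tokenizer = await FileTokenizer.fromFile('sample.mp3');
console.log(tokenizer.fileInfo.size); // size is mandatory on IRandomAccessFileInfo
await tokenizer.close();
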


9 changes: 8 additions & 1 deletion lib/ReadStreamTokenizer.ts
@@ -1,18 +1,21 @@
import { AbstractTokenizer } from './AbstractTokenizer.js';
import { EndOfStreamError, type IStreamReader } from 'peek-readable';
import type { IReadChunkOptions, ITokenizerOptions } from './types.js';
import type {IFileInfo, IReadChunkOptions, ITokenizerOptions} from './types.js';

const maxBufferSize = 256000;

export class ReadStreamTokenizer extends AbstractTokenizer {

public fileInfo: IFileInfo;

/**
* Constructor
* @param streamReader stream-reader to read from
* @param options Tokenizer options
*/
public constructor(private streamReader: IStreamReader, options?: ITokenizerOptions) {
super(options);
this.fileInfo = options?.fileInfo ?? {};
}

/**
@@ -102,4 +105,8 @@ export class ReadStreamTokenizer extends AbstractTokenizer {
public abort(): Promise<void> {
return this.streamReader.abort();
}

supportsRandomAccess(): boolean {
return false;
}
}
2 changes: 1 addition & 1 deletion lib/core.ts
@@ -6,7 +6,7 @@ import { BufferTokenizer } from './BufferTokenizer.js';
import type { ITokenizerOptions } from './types.js';

export { EndOfStreamError, type AnyWebByteStream } from 'peek-readable';
export type { ITokenizer, IFileInfo, ITokenizerOptions, IReadChunkOptions, OnClose } from './types.js';
export type { ITokenizer, IRandomAccessTokenizer, IFileInfo, ITokenizerOptions, IReadChunkOptions, OnClose } from './types.js';
export type { IToken, IGetToken } from '@tokenizer/token';
export { AbstractTokenizer } from './AbstractTokenizer.js';

14 changes: 8 additions & 6 deletions lib/index.ts
@@ -2,8 +2,9 @@ import type { Readable } from 'node:stream';
import type { ReadStreamTokenizer } from './ReadStreamTokenizer.js';
import { stat as fsStat } from 'node:fs/promises';
import { type ITokenizerOptions, fromStream as coreFromStream } from './core.js';
import {FileTokenizer} from "./FileTokenizer.js";

export { fromFile } from './FileTokenizer.js';
export { FileTokenizer } from './FileTokenizer.js';
export * from './core.js';
export type { IToken, IGetToken } from '@tokenizer/token';

@@ -22,12 +23,13 @@ interface StreamWithFile extends Readable {
* @returns Tokenizer
*/
export async function fromStream(stream: Readable, options?: ITokenizerOptions): Promise<ReadStreamTokenizer> {
const augmentedOptions: ITokenizerOptions = options ?? {};
augmentedOptions.fileInfo = augmentedOptions.fileInfo ?? {};
const rst = coreFromStream(stream, options);
if ((stream as StreamWithFile).path) {
const stat = await fsStat((stream as StreamWithFile).path as string);
augmentedOptions.fileInfo.path = (stream as StreamWithFile).path;
augmentedOptions.fileInfo.size = stat.size;
rst.fileInfo.path = (stream as StreamWithFile).path;
rst.fileInfo.size = stat.size;
}
return coreFromStream(stream, augmentedOptions);
return rst;
}

export const fromFile = FileTokenizer.fromFile;
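The rewrite makes fromStream() patch the tokenizer's fileInfo in place after construction instead of mutating the caller's options object. For a file-backed Readable, path and size are still picked up automatically; a sketch (file name hypothetical, package name strtok3 assumed):

import { createReadStream } from 'node:fs';
import { fromStream } from 'strtok3';

const tokenizer = await fromStream(createReadStream('sample.mp3'));
// path and size were stat'ed because the stream exposes a file path
console.log(tokenizer.fileInfo.path, tokenizer.fileInfo.size);
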
34 changes: 30 additions & 4 deletions lib/types.ts
@@ -21,6 +21,13 @@ export interface IFileInfo {
url?: string;
}

export interface IRandomAccessFileInfo extends IFileInfo {
/**
* File size in bytes
*/
size: number;
}

export interface IReadChunkOptions {

/**
@@ -36,16 +43,30 @@
/**
* Position where to begin reading from the file.
* Default it is `tokenizer.position`.
* Position may not be less then `tokenizer.position`.
* Position may not be less than `tokenizer.position`, unless `supportsRandomAccess()` returns `true`.
*/
position?: number;

/**
* If set, will not throw an EOF error if not all of the requested data could be read
*/
mayBeLess?: boolean;
}

export interface IRandomAccessTokenizer extends ITokenizer {

/**
* Provide access to information of the underlying information stream or file.
*/
fileInfo: IRandomAccessFileInfo;

/**
* Change the position (offset) of the tokenizer
* @param position New position
*/
setPosition(position: number): void;
}

/**
* The tokenizer allows us to read or peek from the tokenizer-stream.
* The tokenizer-stream is an abstraction of a stream, file or Buffer.
@@ -55,12 +76,12 @@ export interface ITokenizer {
/**
* Provide access to information of the underlying information stream or file.
*/
fileInfo: IFileInfo;
readonly fileInfo: IFileInfo;

/**
* Offset in bytes (= number of bytes read) since beginning of file or stream
*/
position: number;
readonly position: number;

/**
* Peek (read ahead) buffer from tokenizer
@@ -123,6 +144,11 @@ export interface ITokenizer {
* Abort pending asynchronous operations
*/
abort(): Promise<void>;

/**
* Returns true when the underlying file supports random access
*/
supportsRandomAccess(): boolean;
}

export type OnClose = () => Promise<void>;
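Because supportsRandomAccess() is now part of ITokenizer, callers holding the base type can narrow to IRandomAccessTokenizer at runtime. A hypothetical type-guard sketch, not part of this commit:

import type { ITokenizer, IRandomAccessTokenizer } from 'strtok3';

// Hypothetical helper: narrows the base interface when seeking is supported
function isRandomAccessTokenizer(tokenizer: ITokenizer): tokenizer is IRandomAccessTokenizer {
  return tokenizer.supportsRandomAccess();
}

function rewind(tokenizer: ITokenizer): void {
  if (isRandomAccessTokenizer(tokenizer)) {
    tokenizer.setPosition(0); // setPosition is available after narrowing
  }
}
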
Binary file added test/resources/id3v1.mp3
Binary file not shown.
25 changes: 23 additions & 2 deletions test/test.ts
@@ -9,7 +9,6 @@ import { assert, expect, use } from 'chai';
import chaiAsPromised from 'chai-as-promised';
import { fromStream, fromWebStream, fromFile, fromBuffer, type ITokenizer } from '../lib/index.js';
import Path from 'node:path';
import { FileTokenizer } from '../lib/FileTokenizer.js';
import { EndOfStreamError } from 'peek-readable';

import mocha from 'mocha';
@@ -617,7 +616,7 @@ describe('Matrix tests', () => {

const rst = await tokenizerType.loadTokenizer('test1.dat');

if (rst instanceof FileTokenizer) {
if (rst.supportsRandomAccess()) {
assert.strictEqual(rst.fileInfo.size, 16, 'check file size property');
}
await peekOnData(rst);
@@ -993,5 +992,27 @@ it('should release stream after close', async () => {
assert.isFalse(stream.locked, 'stream is unlocked after closing tokenizer');
});

describe('Random-read-access', () => {

it('Read ID3v1 header at the end of the file', async () => {

const tokenizer = await fromFile(getResourcePath('id3v1.mp3'));
try {
const id3HeaderSize = 128;
const id3Header = new Uint8Array(id3HeaderSize);
await tokenizer.readBuffer(id3Header, {position: tokenizer.fileInfo.size - id3HeaderSize});
const id3Tag = new TextDecoder('utf-8').decode(id3Header.subarray(0, 3));
assert.strictEqual(id3Tag, 'TAG');
assert.strictEqual(tokenizer.position, tokenizer.fileInfo.size, 'Tokenizer position should be at the end of the file');
tokenizer.setPosition(0);
assert.strictEqual(tokenizer.position, 0, 'Tokenizer position should be at the beginning of the file');
}
finally {
await tokenizer.close();
}
});

});


