diff --git a/biome.json b/biome.json index 96f8afcd..5eee1544 100644 --- a/biome.json +++ b/biome.json @@ -97,6 +97,9 @@ }, "complexity": { "useLiteralKeys": "off" + }, + "suspicious": { + "noMisplacedAssertion": "off" } } } diff --git a/playground/case_when.html b/playground/case_when.html new file mode 100644 index 00000000..46e4fe92 --- /dev/null +++ b/playground/case_when.html @@ -0,0 +1,434 @@ + + + + + + tsb — case_when + + + + +
+
+
Initializing playground…
+
+ + ← Back to roadmap +

case_when

+

Conditional value selection using CASE WHEN semantics — mirrors pandas.Series.case_when() (pandas 2.2+).

+ +
+

1 — Basic grade classification

+

caseWhen(series, caselist) applies an ordered list of [condition, replacement] pairs. The first matching condition determines the output; if no condition matches, the original value is kept.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

2 — Using boolean Series as conditions

+

Conditions can be boolean Series objects (e.g. from comparison operations).

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

3 — Using predicate functions

+

Conditions can be predicate functions (value, index) => boolean.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

4 — Series as replacement values

+

Replacements can be Series objects — the matching positional value is used.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

5 — Unmatched rows keep original values

+

Any row not matched by any condition retains its original value — there is no implicit "else" replacement.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

6 — First matching condition wins

+

When multiple conditions match the same row, the first one in caselist takes effect — just like CASE WHEN … THEN … WHEN … THEN … END in SQL.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

7 — Positional index in predicate

+

Predicate functions receive both the value and its positional index as the second argument.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

8 — String Series classification

+

caseWhen works on any Series type — numbers, strings, booleans, or mixed.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

9 — Comparison with where / mask

+

caseWhen generalises whereSeries to multiple branches. Use whereSeries for a single condition; use caseWhen for multi-branch logic.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + + + + + diff --git a/playground/flags.html b/playground/flags.html new file mode 100644 index 00000000..18c8cbf6 --- /dev/null +++ b/playground/flags.html @@ -0,0 +1,300 @@ + + + + + + tsb — Flags: metadata for DataFrame and Series + + + + +
+
+
Initializing playground…
+
+ + ← Back to roadmap +

Flags: metadata for DataFrame and Series

+

+ Mirrors + pandas.DataFrame.flags — controls duplicate-label behaviour. +

+ + +
+

1 · Default flags

+

+ Every DataFrame and Series exposes a + flags getter returning a Flags object. + By default, allowsDuplicateLabels is true. +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Setting flags

+

+ Mutate allowsDuplicateLabels directly on the + Flags object. The change is shared across all + Flags wrappers for the same underlying object. +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · DuplicateLabelError

+

+ Setting allowsDuplicateLabels = false on an object with + duplicate index labels immediately throws a + DuplicateLabelError. +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · copy() and raiseOnDuplicates()

+

+ Flags.copy() returns a new wrapper sharing the same state. + raiseOnDuplicates() validates only when + allowsDuplicateLabels is false. +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + + + + + diff --git a/playground/index.html b/playground/index.html index 1de4cd2e..82fd3ebb 100644 --- a/playground/index.html +++ b/playground/index.html @@ -235,6 +235,11 @@

Wide-to-long reshape. Unpivot columns into variable/value pairs with id_vars, value_vars, var_name, value_name.

✅ Complete
+
+

↕ lreshape

+

Wide-to-long reshape with named column groups. Stack multiple wide columns into long columns with explicit grouping, dropna support.

+
✅ Complete
+

🔄 pivot & pivotTable

Reshape with aggregation. pivot() for unique reshaping; pivotTable() for aggregation (mean/sum/count/min/max/first/last) with fill_value and dropna support.

@@ -330,6 +335,11 @@

Attach arbitrary key→value metadata to any Series or DataFrame via a WeakMap registry. Provides getAttrs, setAttrs, updateAttrs, copyAttrs, withAttrs, mergeAttrs, clearAttrs, getAttr, setAttr, deleteAttr, attrsCount, attrsKeys. Mirrors pandas.DataFrame.attrs / pandas.Series.attrs.

✅ Complete

+
+

🚩 flags — Metadata Flags

+

Metadata flags for DataFrame and Series. The flags getter returns a Flags object with allowsDuplicateLabels property. Setting allowsDuplicateLabels = false on an object with duplicate index labels raises DuplicateLabelError. Mirrors pandas.DataFrame.flags / pandas.core.flags.Flags.

+
✅ Complete
+

🔤 string_ops — Standalone String Ops

Module-level string utilities: strNormalize (Unicode NFC/NFD/NFKC/NFKD), strGetDummies (one-hot DataFrame), strExtractAll (all regex matches), strRemovePrefix, strRemoveSuffix, strTranslate (char-level substitution), strCharWidth (CJK-aware display width), strByteLength. Works on Series, arrays, or scalars.

@@ -501,6 +511,31 @@

✅ Complete

+
+

📄 readXml / toXml — pd.read_xml() / DataFrame.to_xml()

+

readXml(text, opts?) / toXml(df, opts?) — parse XML into DataFrames and serialize back. rowTag auto-detection, attributes, CDATA, entities, namespaces, usecols, nrows, indexCol. Mirrors pandas.read_xml() / DataFrame.to_xml().

+
✅ Complete
+
+
+

📋 readTable — pd.read_table()

+

readTable(text, opts?) — parse delimiter-separated text into a DataFrame. Defaults to tab separator; all ReadCsvOptions forwarded. Mirrors pandas.read_table().

+
✅ Complete
+
+
+

🗄️ SQL I/O — pd.read_sql() / DataFrame.to_sql()

+

readSql / readSqlQuery / readSqlTable / toSql — adapter-based SQL I/O. Bring your own DB driver; zero runtime dependencies. Mirrors pandas.read_sql(), read_sql_query(), read_sql_table(), DataFrame.to_sql().

+
✅ Complete
+
+
+

📊 readStata & toStata — pd.read_stata() / DataFrame.to_stata()

+

readStata / toStata — Stata DTA binary file I/O. Supports reading v114/115 (old binary) and v117/118/119 (new XML-tagged) formats; writes v118. Missing values, string columns, value labels (convertCategoricals). Mirrors pandas.read_stata(), DataFrame.to_stata().

+
✅ Complete
+
+
+

🔀 case_when — pd.Series.case_when()

+

caseWhen(series, caselist) — conditional value selection using ordered CASE WHEN semantics. Mirrors pandas.Series.case_when() (pandas 2.2+).

+
✅ Complete
+
diff --git a/playground/lreshape.html b/playground/lreshape.html new file mode 100644 index 00000000..3f434a11 --- /dev/null +++ b/playground/lreshape.html @@ -0,0 +1,327 @@ + + + + + + tsb — lreshape + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

↕ lreshape — Interactive Playground

+

Reshape wide-format data to long format using named column groups — + mirrors pandas.lreshape().
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · Basic lreshape

+

Stack two wide columns (v1, v2) into a single long + column v, repeating the id column for each block.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Multiple groups

+

Reshape with multiple output columns simultaneously. Each output column is + fed from a separate list of input columns.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · dropna option

+

By default rows where any value column is null/NaN + are dropped. Pass dropna: false to keep them.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · Real-world: survey scores

+

Stack multiple rounds of survey scores into a long-format table.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+

Reshape wide-format data to long format by explicitly naming which input + columns map to each output column.

+
lreshape(
+  data: DataFrame,
+  groups: Record<string, string[]>,  // { outputCol: [inputCol1, inputCol2, ...] }
+  options?: {
+    dropna?: boolean,  // drop rows with null/NaN values (default: true)
+  }
+): DataFrame
+

All input columns not mentioned in groups + become identity (id) columns and are repeated for each block. All group lists must + have the same length k; the result has nRows × k rows + (before applying dropna).

+
+ + + + + diff --git a/playground/read_table.html b/playground/read_table.html new file mode 100644 index 00000000..550913b8 --- /dev/null +++ b/playground/read_table.html @@ -0,0 +1,367 @@ + + + + + + tsb — readTable + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

📋 readTable — Interactive Playground

+

+ Parse delimiter-separated text into a DataFrame + with readTable(). Mirrors + pandas + read_table() — identical to readCsv() but defaults + to a tab (\t) separator.
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · Basic tab-separated file

+

By default readTable() splits on tabs, infers column dtypes, + and returns a DataFrame.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Custom separator

+

Pass sep to use any delimiter — pipe, semicolon, or + multi-character strings.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · Handling missing values

+

readTable() recognises common NA strings (NA, + N/A, null, …) and converts them to + NaN in inferred numeric columns and null otherwise. Extend the list with naValues.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · Index column, row limits & skip rows

+

Use indexCol to promote a column to the row index. + nRows caps the number of data rows read; skipRows + skips rows after the header.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+

Parse a delimiter-separated text string into a DataFrame. + Defaults to tab (\t) unlike readCsv which uses + a comma.

+
readTable(text: string, options?: ReadTableOptions): DataFrame
+
+interface ReadTableOptions {
+  sep?:      string;                     // separator (default: "\t")
+  header?:   number | null;              // header row index (default: 0)
+  indexCol?: string | number | null;     // column to use as row index
+  dtype?:    Record<string, DtypeName>; // force dtype for named columns
+  naValues?: readonly string[];          // extra NA string values
+  skipRows?: number;                     // data rows to skip after header
+  nRows?:    number;                     // maximum data rows to read
+}
+
+ + + + + diff --git a/playground/sql.html b/playground/sql.html new file mode 100644 index 00000000..8c28d1f6 --- /dev/null +++ b/playground/sql.html @@ -0,0 +1,476 @@ + + + + + + tsb — SQL I/O + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

🗃️ SQL I/O — Interactive Playground

+

+ readSql, readSqlQuery, readSqlTable, and toSql + mirror pandas + read_sql() and + DataFrame.to_sql(). + Because tsb has zero runtime dependencies, you pass + a SqlConnection adapter for your database driver. + Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · readSqlQuery — run a SELECT statement

+

Pass a SQL string and a SqlConnection adapter. The result is a + DataFrame. An optional indexCol promotes a column to the row + index.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · readSqlTable — load an entire table

+

Pass a table name (not a SQL string). Use columns to select a subset, + or indexCol to set the row index.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · readSql — auto-detect query vs table name

+

readSql inspects the first argument: if it looks like a SQL statement + it calls readSqlQuery; otherwise it calls readSqlTable.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · toSql — write a DataFrame to a SQL table

+

Writes rows from a DataFrame into the database. Returns the number of + rows written. The ifExists option controls what happens when the table + already exists: "fail", "replace", or + "append".

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+

All four functions accept a SqlConnection adapter — implement + query() plus optional listTables() and insert() + for your database driver.

+
interface SqlConnection {
+  query(sql: string, params?: readonly SqlValue[]): SqlResult;
+  listTables?(): string[];
+  insert?(table: string, rows: object[], columns: string[], ifExists: IfExistsStrategy): number;
+}
+
+readSqlQuery(sql: string, con: SqlConnection, options?: ReadSqlOptions): DataFrame
+readSqlTable(table: string, con: SqlConnection, options?: ReadSqlOptions): DataFrame
+readSql(sqlOrTable: string, con: SqlConnection, options?: ReadSqlOptions): DataFrame
+toSql(df: DataFrame, name: string, con: SqlConnection, options?: ToSqlOptions): number
+
+interface ReadSqlOptions {
+  indexCol?: string | string[];
+  columns?:  string[];
+  params?:   readonly SqlValue[];
+  parseDates?: string[];
+}
+
+interface ToSqlOptions {
+  ifExists?: "fail" | "replace" | "append";  // default: "fail"
+  index?:    boolean;                          // include index column (default: true)
+  chunkSize?: number;
+}
+
+ + + + + diff --git a/playground/stata.html b/playground/stata.html new file mode 100644 index 00000000..18743f45 --- /dev/null +++ b/playground/stata.html @@ -0,0 +1,379 @@ + + + + + + tsb — readStata & toStata + + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

📊 readStata & toStata — Interactive Playground

+

Read and write Stata DTA files from TypeScript. + toStata(df) serializes a DataFrame to a Stata DTA v118 binary buffer. + readStata(buf, options) parses the buffer back into a DataFrame. + Numeric missing values are represented as null. Mirrors + pandas.read_stata() and DataFrame.to_stata().
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · Basic round-trip — write and read back

+

Create a DataFrame, serialize it to a Stata DTA v118 binary buffer with + toStata(), then parse it back with readStata(). + All columns, values, and shape are preserved.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

2 · Missing values — null round-trip

+

Stata represents missing numeric values as special sentinel bit patterns. + readStata maps all missing sentinels to null. + toStata writes the standard Stata system-missing value for each type.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

3 · Options — dataLabel & variableLabels

+

Embed a dataset description with dataLabel and per-column annotations + with variableLabels. These metadata fields are stored in the DTA header + and are visible in Stata's describe command.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

4 · Options — usecols, nRows, indexCol

+

Restrict columns with usecols, limit rows with nRows, + and promote a column to the DataFrame index with indexCol.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

5 · Boolean columns

+

Boolean values are stored as Stata byte (int8) with + true → 1 and false → 0. Reading converts + them back to numbers; use .map() or comparison operators + to recover booleans if needed.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

6 · writeIndex — include the row index

+

Pass writeIndex: true to include the DataFrame's row index + as an extra _index column in the DTA file.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + + + diff --git a/playground/xml.html b/playground/xml.html new file mode 100644 index 00000000..23e2e96d --- /dev/null +++ b/playground/xml.html @@ -0,0 +1,462 @@ + + + + + + tsb — readXml & toXml + + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

📄 readXml & toXml — Interactive Playground

+

Parse XML text into a DataFrame with + auto-detection of row elements, attribute and child-element columns, entity decoding, + CDATA support, namespace stripping, and numeric coercion. Serialize any DataFrame + back to well-formed XML with full formatting control. Mirrors + pandas.read_xml() and pandas.DataFrame.to_xml().
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · Basic readXml — child-element rows

+

The most common XML layout: a root element containing repeating row elements, + each with child elements as columns. readXml auto-detects the row + tag and coerces numeric strings automatically.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

2 · Attribute rows

+

XML elements can carry data as attributes instead of (or in addition to) child + elements. Use attribs: true (the default) to include them.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

3 · usecols, nrows, indexCol

+

Restrict the columns returned with usecols, limit rows with + nrows, and promote a column to the index with indexCol.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

4 · naValues — custom NA strings

+

Built-in NA strings include "", "NA", "NaN", + "N/A", "null", "None", "nan". + Use naValues to add your own.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

5 · Entities & CDATA

+

Named entities (&amp;, &lt;, …), decimal/hex + character references (&#65;, &#x41;), and + CDATA sections (<![CDATA[…]]>) are all handled transparently.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

6 · toXml — child elements (default)

+

toXml(df) produces a well-formed XML document with an XML declaration, + a configurable root element, and one child element per row containing one sub-element + per column.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

7 · toXml — attribs mode

+

Set attribs: true to emit column values as XML attributes on each + row element instead of as child elements — produces more compact output.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

8 · toXml — namespaces & CDATA columns

+

Declare XML namespace prefixes on the root element with namespaces. + Wrap sensitive columns in CDATA sections with cdataCols to preserve + special characters literally.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

9 · Round-trip: toXml → readXml

+

Serializing a DataFrame to XML and reading it back should produce an identical + DataFrame (shape and values).

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + + + diff --git a/src/core/flags.ts b/src/core/flags.ts new file mode 100644 index 00000000..546cb031 --- /dev/null +++ b/src/core/flags.ts @@ -0,0 +1,186 @@ +/** + * Flags — metadata flags for DataFrame and Series objects. + * + * Mirrors `pandas.core.flags.Flags`. Provides the `allowsDuplicateLabels` + * flag that controls whether duplicate row/column labels are permitted in the + * associated DataFrame or Series. + * + * @example + * ```ts + * import { DataFrame, DuplicateLabelError } from "tsb"; + * + * const df = DataFrame.fromColumns({ a: [1, 2, 3] }); + * df.flags.allowsDuplicateLabels; // true (default) + * + * df.flags.allowsDuplicateLabels = false; + * // Setting false on a DataFrame with no duplicates is fine. + * + * const dfDup = new DataFrame( + * new Map([["a", df.col("a")]]), + * df.index.append(df.index), // duplicate index + * ); + * dfDup.flags.allowsDuplicateLabels = false; // throws DuplicateLabelError + * ``` + * + * @packageDocumentation + */ + +import { DuplicateLabelError } from "../errors.ts"; + +// --------------------------------------------------------------------------- +// Structural interfaces (no imports from frame.ts / series.ts) +// --------------------------------------------------------------------------- + +/** + * Minimal structural interface satisfied by any `Index` instance. + * Defined here (instead of importing from base-index.ts) to avoid circular + * imports — frame.ts → flags.ts must not require flags.ts → frame.ts. + */ +interface IndexLike { + readonly values: readonly unknown[]; + readonly size: number; +} + +/** + * Structural interface satisfied by both `DataFrame` and `Series`. + * Used as the WeakMap key so flags.ts never imports the concrete classes. + */ +export interface FlaggedObject { + /** Row index of the object. 
*/ + readonly index: IndexLike; +} + +// --------------------------------------------------------------------------- +// Internal state registry +// --------------------------------------------------------------------------- + +interface FlagsState { + allowsDuplicateLabels: boolean; +} + +const registry = new WeakMap(); + +function getState(obj: FlaggedObject): FlagsState { + let state = registry.get(obj); + if (state === undefined) { + state = { allowsDuplicateLabels: true }; + registry.set(obj, state); + } + return state; +} + +// --------------------------------------------------------------------------- +// Flags class +// --------------------------------------------------------------------------- + +/** + * Metadata flags for a `DataFrame` or `Series`. + * + * Accessible via `df.flags` or `series.flags`. Mutations are reflected + * immediately on the underlying object because state is stored in a + * module-level WeakMap keyed by the object reference. + * + * ### pandas reference + * `pandas.core.flags.Flags` + */ +export class Flags { + private readonly _obj: FlaggedObject; + + /** + * @param obj - The DataFrame or Series this Flags object is bound to. + * @param opts.allowsDuplicateLabels - Initial value for `allowsDuplicateLabels`. + * Defaults to `true` when not previously set. + */ + constructor(obj: FlaggedObject, opts: { allowsDuplicateLabels?: boolean } = {}) { + this._obj = obj; + if (opts.allowsDuplicateLabels !== undefined) { + getState(obj).allowsDuplicateLabels = opts.allowsDuplicateLabels; + } + } + + // ── allowsDuplicateLabels ───────────────────────────────────────────────── + + /** + * Whether duplicate labels (along any axis) are allowed. + * + * Defaults to `true`. When set to `false`, any existing duplicate labels + * trigger a `DuplicateLabelError` immediately. Future operations that would + * produce duplicate labels also raise. 
+ * + * @example + * ```ts + * df.flags.allowsDuplicateLabels; // true + * df.flags.allowsDuplicateLabels = false; + * df.flags.allowsDuplicateLabels; // false + * ``` + */ + get allowsDuplicateLabels(): boolean { + return getState(this._obj).allowsDuplicateLabels; + } + + set allowsDuplicateLabels(value: boolean) { + getState(this._obj).allowsDuplicateLabels = value; + if (!value) { + this._validateNoDuplicates(); + } + } + + // ── helpers ─────────────────────────────────────────────────────────────── + + /** + * Raise `DuplicateLabelError` if the bound object currently has duplicate + * row-index labels. + */ + private _validateNoDuplicates(): void { + const { values } = this._obj.index; + const seen = new Set(); + for (const label of values) { + if (seen.has(label)) { + throw new DuplicateLabelError(`Index has duplicate keys: [${String(label)}]`); + } + seen.add(label); + } + } + + /** + * Raise `DuplicateLabelError` if `allowsDuplicateLabels` is `false` and + * the bound object has duplicate labels. Called by DataFrame/Series methods + * after operations that could introduce duplicates. + */ + raiseOnDuplicates(): void { + if (!this.allowsDuplicateLabels) { + this._validateNoDuplicates(); + } + } + + /** + * Return a copy of this Flags object bound to the **same** underlying object. + * + * The returned `Flags` shares state with the original — mutations to either + * are reflected in both (they both write to the same WeakMap entry). + */ + copy(): Flags { + return new Flags(this._obj); + } + + /** Human-readable representation mirroring pandas' `repr(df.flags)`. */ + toString(): string { + return ``; + } +} + +// --------------------------------------------------------------------------- +// Registry accessor (used by DataFrame.flags / Series.flags getters) +// --------------------------------------------------------------------------- + +/** + * Return (or lazily create) the `Flags` wrapper for the given object. 
+ * + * Each call creates a *new* `Flags` wrapper object, but all wrappers for the + * same `obj` share the same state via the module-level WeakMap registry. + * + * @param obj - The DataFrame or Series to get flags for. + */ +export function getFlags(obj: FlaggedObject): Flags { + return new Flags(obj); +} diff --git a/src/core/frame.ts b/src/core/frame.ts index ec18d144..e21c341e 100644 --- a/src/core/frame.ts +++ b/src/core/frame.ts @@ -26,6 +26,8 @@ import type { ExpandingOptions } from "../window/index.ts"; import { Rolling } from "../window/index.ts"; import type { RollingOptions } from "../window/index.ts"; import { Index } from "./base-index.ts"; +import { getFlags } from "./flags.ts"; +import type { Flags } from "./flags.ts"; import { RangeIndex } from "./range-index.ts"; import { Series } from "./series.ts"; @@ -245,6 +247,21 @@ export class DataFrame { return this.index.size === 0 || this.columns.size === 0; } + /** + * Metadata flags for this DataFrame. + * + * Controls behaviour such as whether duplicate labels are allowed. 
+ * + * @example + * ```ts + * df.flags.allowsDuplicateLabels; // true (default) + * df.flags.allowsDuplicateLabels = false; + * ``` + */ + get flags(): Flags { + return getFlags(this); + } + // ─── column access ──────────────────────────────────────────────────────── /** diff --git a/src/core/index.ts b/src/core/index.ts index 130c748e..2ac9ba64 100644 --- a/src/core/index.ts +++ b/src/core/index.ts @@ -151,3 +151,6 @@ export type { ExtensionDtypeConstructor, ExtensionArrayConstructor, } from "./extensions.ts"; + +export { Flags, getFlags } from "./flags.ts"; +export type { FlaggedObject } from "./flags.ts"; diff --git a/src/core/series.ts b/src/core/series.ts index 29063e91..03815a8b 100644 --- a/src/core/series.ts +++ b/src/core/series.ts @@ -21,6 +21,8 @@ import type { CatSeriesLike } from "./cat_accessor.ts"; import { DatetimeAccessor } from "./datetime_accessor.ts"; import type { DatetimeSeriesLike } from "./datetime_accessor.ts"; import { Dtype } from "./dtype.ts"; +import { getFlags } from "./flags.ts"; +import type { Flags } from "./flags.ts"; import { RangeIndex } from "./range-index.ts"; import { StringAccessor } from "./string_accessor.ts"; import type { StringSeriesLike } from "./string_accessor.ts"; @@ -286,6 +288,21 @@ export class Series { return this._values.length === 0; } + /** + * Metadata flags for this Series. + * + * Controls behaviour such as whether duplicate labels are allowed. + * + * @example + * ```ts + * s.flags.allowsDuplicateLabels; // true (default) + * s.flags.allowsDuplicateLabels = false; + * ``` + */ + get flags(): Flags { + return getFlags(this); + } + /** Snapshot of the underlying values as a plain array. 
*/ get values(): readonly T[] { return this._values; diff --git a/src/errors.ts b/src/errors.ts index 4ea24681..83099389 100644 --- a/src/errors.ts +++ b/src/errors.ts @@ -86,6 +86,19 @@ export class EmptyDataError extends Error { } } +/** + * Raised when an operation would produce (or encounters) duplicate labels + * on an object where `flags.allowsDuplicateLabels` is `false`. + * + * Equivalent to `pandas.errors.DuplicateLabelError`. + */ +export class DuplicateLabelError extends ValueError { + override readonly name = "DuplicateLabelError"; + constructor(message = "Index has duplicates") { + super(message); + } +} + /** Raised when casting to integer would lose data due to NaN values. */ export class IntCastingNaNError extends Error { override readonly name = "IntCastingNaNError"; @@ -233,6 +246,7 @@ export const errors = { DatabaseError, DataError, DtypeWarning, + DuplicateLabelError, EmptyDataError, IntCastingNaNError, InvalidColumnName, diff --git a/src/index.ts b/src/index.ts index 2f49842f..d0048033 100644 --- a/src/index.ts +++ b/src/index.ts @@ -62,6 +62,26 @@ export { toJsonDenormalize, toJsonRecords, toJsonSplit, toJsonIndex } from "./io export type { JsonDenormalizeOptions, JsonSplitOptions, JsonSplitResult } from "./io/index.ts"; export { readHtml } from "./io/index.ts"; export type { ReadHtmlOptions } from "./io/index.ts"; +export { readXml, toXml } from "./io/index.ts"; +export type { ReadXmlOptions, ToXmlOptions } from "./io/index.ts"; +export { readTable } from "./io/index.ts"; +export type { ReadTableOptions } from "./io/index.ts"; +export { readSql, readSqlQuery, readSqlTable, toSql } from "./io/index.ts"; +export { TableExistsError, TableNotFoundError } from "./io/index.ts"; +export { readStata, toStata } from "./io/index.ts"; +export type { ReadStataOptions, ToStataOptions } from "./io/index.ts"; +export type { + SqlValue, + SqlRow, + SqlResult, + SqlConnection, + IfExistsStrategy, + ReadSqlBaseOptions, + ReadSqlQueryOptions, + 
ReadSqlTableOptions, + ReadSqlOptions, + ToSqlOptions, +} from "./io/index.ts"; export { pearsonCorr, dataFrameCorr, dataFrameCov } from "./stats/index.ts"; export type { CorrMethod, CorrOptions, CovOptions } from "./stats/index.ts"; export { Rolling } from "./window/index.ts"; @@ -103,6 +123,8 @@ export { wideToLong } from "./reshape/index.ts"; export type { WideToLongOptions } from "./reshape/index.ts"; export { pivotTableFull } from "./reshape/index.ts"; export type { PivotTableFullOptions } from "./reshape/index.ts"; +export { lreshape } from "./reshape/index.ts"; +export type { LreshapeGroups, LreshapeOptions } from "./reshape/index.ts"; export { MultiIndex } from "./core/index.ts"; export type { MultiIndexOptions } from "./core/index.ts"; export { rankSeries, rankDataFrame } from "./stats/index.ts"; @@ -783,3 +805,8 @@ export { IndexError, } from "./errors.ts"; export type { PandasError } from "./errors.ts"; +export { DuplicateLabelError } from "./errors.ts"; +export { caseWhen } from "./stats/index.ts"; +export type { CaseWhenBranch, CaseWhenPredicate } from "./stats/index.ts"; +export { Flags, getFlags } from "./core/index.ts"; +export type { FlaggedObject } from "./core/index.ts"; diff --git a/src/io/csv.ts b/src/io/csv.ts index 687355f0..331ee944 100644 --- a/src/io/csv.ts +++ b/src/io/csv.ts @@ -144,6 +144,7 @@ function isNaRaw(raw: string, naSet: ReadonlySet): boolean { /** Infer the most specific dtype for a column from its raw string values. */ function inferColumnDtype(raws: readonly string[], naSet: ReadonlySet): DtypeName { const nonNa = raws.filter((r) => !isNaRaw(r, naSet)); + const hasNa = nonNa.length < raws.length; if (nonNa.length === 0) { return "object"; } @@ -153,18 +154,23 @@ function inferColumnDtype(raws: readonly string[], naSet: ReadonlySet): } const allInt = nonNa.every((r) => RE_INT.test(r)); if (allInt) { - return "int64"; + // Upgrade to float64 when NAs are present so NaN can represent missing values. + return hasNa ? 
"float64" : "int64"; } const allFloat = nonNa.every((r) => RE_FLOAT.test(r)); if (allFloat) { return "float64"; } - return "string"; + return "object"; } /** Parse a raw string to a Scalar for an inferred dtype. */ function parseInferred(raw: string, dtype: DtypeName, naSet: ReadonlySet): Scalar { if (isNaRaw(raw, naSet)) { + // Numeric columns use NaN so callers can detect missing values via Number.isNaN(). + if (dtype === "float64" || dtype === "int64") { + return Number.NaN; + } return null; } if (dtype === "bool") { diff --git a/src/io/index.ts b/src/io/index.ts index 6c5edea0..93f3060d 100644 --- a/src/io/index.ts +++ b/src/io/index.ts @@ -23,6 +23,28 @@ export type { } from "./to_json_normalize.ts"; export { readHtml } from "./read_html.ts"; export type { ReadHtmlOptions } from "./read_html.ts"; +export { readXml, toXml } from "./xml.ts"; +export type { ReadXmlOptions, ToXmlOptions } from "./xml.ts"; +export { readTable } from "./read_table.ts"; +export type { ReadTableOptions } from "./read_table.ts"; + +export { readSql, readSqlQuery, readSqlTable, toSql } from "./sql.ts"; +export { TableExistsError, TableNotFoundError } from "./sql.ts"; + +export { readStata, toStata } from "./stata.ts"; +export type { ReadStataOptions, ToStataOptions } from "./stata.ts"; +export type { + SqlValue, + SqlRow, + SqlResult, + SqlConnection, + IfExistsStrategy, + ReadSqlBaseOptions, + ReadSqlQueryOptions, + ReadSqlTableOptions, + ReadSqlOptions, + ToSqlOptions, +} from "./sql.ts"; // readExcel / xlsxSheetNames use node:zlib and cannot be bundled for the // browser. Import them directly from "tsb/io/read_excel" when running in diff --git a/src/io/read_table.ts b/src/io/read_table.ts new file mode 100644 index 00000000..0290afa1 --- /dev/null +++ b/src/io/read_table.ts @@ -0,0 +1,52 @@ +/** + * readTable — read a general delimiter-separated text file into a DataFrame. + * + * Mirrors `pandas.read_table()`: + * - Same signature as `readCsv` but defaults `sep` to `"\t"`. 
+ * - Handles any single-character (or multi-character) delimiter. + * - All `ReadCsvOptions` are supported; when `sep` is omitted it falls back + * to `"\t"` (tab), distinguishing this function from `readCsv` (whose + * default is `","`). + * + * @module + */ + +import type { DataFrame } from "../core/index.ts"; +import { readCsv } from "./csv.ts"; +import type { ReadCsvOptions } from "./csv.ts"; + +// ─── public types ───────────────────────────────────────────────────────────── + +/** + * Options for {@link readTable}. + * + * Identical to {@link ReadCsvOptions} except the default `sep` is `"\t"`. + */ +export interface ReadTableOptions extends ReadCsvOptions { + /** Column separator. Default: `"\t"` (tab). */ + readonly sep?: string; +} + +// ─── implementation ─────────────────────────────────────────────────────────── + +/** + * Parse a delimiter-separated text string into a {@link DataFrame}. + * + * Equivalent to `pandas.read_table()` — the same as {@link readCsv} but + * defaults to a tab separator instead of a comma. + * + * ```ts + * import { readTable } from "tsb"; + * + * const tsv = "name\tage\tcity\nAlice\t30\tNY\nBob\t25\tLA"; + * const df = readTable(tsv); + * // DataFrame with columns: name, age, city + * ``` + * + * @param text Raw text content of the file. + * @param options Parsing options (see {@link ReadTableOptions}). + */ +export function readTable(text: string, options: ReadTableOptions = {}): DataFrame { + const sep = options.sep ?? "\t"; + return readCsv(text, { ...options, sep }); +} diff --git a/src/io/sql.ts b/src/io/sql.ts new file mode 100644 index 00000000..2e5ace04 --- /dev/null +++ b/src/io/sql.ts @@ -0,0 +1,654 @@ +/** + * read_sql / to_sql — SQL I/O for DataFrame. 
+ * + * Mirrors the pandas SQL I/O API: + * - {@link readSqlQuery} — execute a SQL SELECT and return a DataFrame + * - {@link readSqlTable} — read an entire table into a DataFrame + * - {@link readSql} — auto-detect query vs table name + * - {@link toSql} — write a DataFrame to a SQL table + * + * Because tsb has zero runtime dependencies, this module does **not** ship a + * database driver. Instead it defines the {@link SqlConnection} adapter + * interface. Pass a conforming adapter for your driver of choice + * (better-sqlite3, postgres, mysql2, …) to any of the functions here. + * + * @example + * ```ts + * import type { SqlConnection, SqlResult, SqlValue } from "tsb"; + * import { readSql, toSql } from "tsb"; + * + * // Minimal in-memory adapter (illustrative — not a real DB) + * class MockAdapter implements SqlConnection { + * query(sql: string): SqlResult { + * return { columns: ["id", "name"], rows: [{ id: 1, name: "Alice" }] }; + * } + * } + * + * const db = new MockAdapter(); + * const df = readSql("SELECT * FROM users", db); + * ``` + * + * @module + */ + +import { DataFrame } from "../core/index.ts"; +import { Index } from "../core/index.ts"; +import type { Label, Scalar } from "../types.ts"; + +// ─── SQL value types ────────────────────────────────────────────────────────── + +/** + * A scalar value that may be returned from a SQL query column. + * + * Covers the common ground across DB drivers: numbers, strings, booleans, + * `null` (SQL NULL), and raw byte buffers (SQL BLOB / BYTEA). + */ +export type SqlValue = string | number | boolean | null | Uint8Array; + +/** + * A single row from a SQL result set, mapping column name → value. + */ +export type SqlRow = Record; + +/** + * The complete result of executing a SQL query. + */ +export interface SqlResult { + /** Ordered list of column names as returned by the database. */ + readonly columns: readonly string[]; + /** All data rows. Each row is an object keyed by column name. 
*/ + readonly rows: readonly SqlRow[]; +} + +// ─── connection adapter interface ───────────────────────────────────────────── + +/** + * Strategy for handling a pre-existing table in {@link toSql}. + * + * - `"fail"` — throw {@link TableExistsError} if the table already exists (default). + * - `"replace"` — drop and recreate the table, then insert all rows. + * - `"append"` — insert rows into the existing table without dropping it. + */ +export type IfExistsStrategy = "fail" | "replace" | "append"; + +/** + * Adapter interface for a SQL database connection. + * + * Implement this interface for your specific database driver and pass instances + * to {@link readSql}, {@link readSqlQuery}, {@link readSqlTable}, and + * {@link toSql}. + * + * Only {@link query} is required; all other methods are optional and enable + * more efficient or richer behaviour. + * + * @example + * ```ts + * // Minimal adapter wrapping better-sqlite3 + * import Database from "better-sqlite3"; + * import type { SqlConnection, SqlResult, SqlRow, SqlValue } from "tsb"; + * + * class BetterSqlite3Adapter implements SqlConnection { + * constructor(private readonly db: Database.Database) {} + * + * query(sql: string, params?: readonly SqlValue[]): SqlResult { + * const stmt = this.db.prepare(sql); + * const rows = stmt.all(...(params ?? [])) as SqlRow[]; + * const columns = rows.length > 0 ? Object.keys(rows[0]!) : []; + * return { columns, rows }; + * } + * + * listTables(): string[] { + * return (this.db.prepare( + * "SELECT name FROM sqlite_master WHERE type='table'", + * ).all() as { name: string }[]).map((r) => r.name); + * } + * } + * ``` + */ +export interface SqlConnection { + /** + * Execute a SQL query and return the result set. + * + * @param sql SQL string, which may include `?` (positional) or `$N` + * (numbered) placeholders — semantics depend on the driver. + * @param params Optional positional parameters bound to the placeholders. 
+ */ + query(sql: string, params?: readonly SqlValue[]): SqlResult; + + /** + * Return the names of all tables visible through this connection. + * + * Used by {@link readSqlTable} to validate that the requested table exists. + * When omitted, no up-front validation is performed. + */ + listTables?(): readonly string[]; + + /** + * Insert rows into a table, applying the specified {@link IfExistsStrategy}. + * + * When provided, {@link toSql} delegates bulk insertion to this method, + * allowing the adapter to use database-native batch APIs. + * When omitted, {@link toSql} falls back to individual `INSERT INTO …` + * statements executed via {@link query}. + * + * @param tableName Target table. + * @param rows Row objects — each key is a column name. + * @param columns Ordered column names (matches keys in `rows`). + * @param ifExists How to handle a pre-existing table. + * @returns Number of rows inserted. + */ + insert?( + tableName: string, + rows: readonly SqlRow[], + columns: readonly string[], + ifExists: IfExistsStrategy, + ): number; +} + +// ─── public option types ────────────────────────────────────────────────────── + +/** + * Options shared by all read functions. + */ +export interface ReadSqlBaseOptions { + /** + * Column name or zero-based position to use as the DataFrame row index. + * When a string is given the column must exist in the result. + * When a number is given it selects by position. + * Default: `null` — a default `RangeIndex` is used. + */ + readonly indexCol?: string | number | null; + + /** + * Column names to parse as timestamps. + * Values are converted to milliseconds-since-epoch using `Date.parse()`. + * Non-parseable values are left as-is. + */ + readonly parseDates?: readonly string[]; +} + +/** + * Options for {@link readSqlQuery}. + */ +export interface ReadSqlQueryOptions extends ReadSqlBaseOptions { + /** + * Positional parameter bindings for the SQL query. + * Passed verbatim to {@link SqlConnection.query}. 
+ */ + readonly params?: readonly SqlValue[]; +} + +/** + * Options for {@link readSqlTable}. + */ +export interface ReadSqlTableOptions extends ReadSqlBaseOptions { + /** + * Schema qualifier to prefix the table name (e.g. `"public"` in PostgreSQL). + * When provided the query uses `"schema"."table"`. + */ + readonly schema?: string; + + /** + * Subset of columns to retrieve. When omitted all columns are returned. + */ + readonly columns?: readonly string[]; +} + +/** + * Options for {@link readSql}. + * Combines {@link ReadSqlQueryOptions} and {@link ReadSqlTableOptions}. + */ +export interface ReadSqlOptions extends ReadSqlQueryOptions, ReadSqlTableOptions {} + +/** + * Options for {@link toSql}. + */ +export interface ToSqlOptions { + /** + * Behaviour when a table named `tableName` already exists. + * Default: `"fail"`. + */ + readonly ifExists?: IfExistsStrategy; + + /** + * Whether to write the DataFrame's row index as a column. + * Default: `true`. + */ + readonly index?: boolean; + + /** + * Column label to use for the written index column. + * Only effective when `index` is `true`. + * Default: the index name when set, otherwise `"index"`. + */ + readonly indexLabel?: string | null; + + /** + * Number of rows to insert per batch. + * Ignored when the adapter provides {@link SqlConnection.insert}. + * Default: all rows in a single batch. + */ + readonly chunksize?: number; +} + +// ─── errors ─────────────────────────────────────────────────────────────────── + +/** + * Thrown by {@link toSql} when `ifExists: "fail"` (the default) and the + * target table already exists. + */ +export class TableExistsError extends Error { + /** @param tableName The table that already exists. */ + constructor(tableName: string) { + super(`Table "${tableName}" already exists. Use ifExists: "replace" or "append".`); + this.name = "TableExistsError"; + } +} + +/** + * Thrown by {@link readSqlTable} when the requested table is not found. 
+ */ +export class TableNotFoundError extends Error { + /** @param tableName The table that was not found. */ + constructor(tableName: string) { + super(`Table "${tableName}" not found in the database.`); + this.name = "TableNotFoundError"; + } +} + +// ─── internal helpers ───────────────────────────────────────────────────────── + +/** Convert a {@link SqlValue} to a tsb {@link Scalar}. */ +function sqlValueToScalar(v: SqlValue): Scalar { + if (v instanceof Uint8Array) { + // Represent BLOB as a hex-encoded string so it can sit in a + // string-typed Series without losing data. + return Buffer.from(v).toString("hex"); + } + return v; +} + +/** + * Build a DataFrame from a {@link SqlResult}, applying common options. + * + * @internal + */ +function resultToDataFrame(result: SqlResult, options: ReadSqlBaseOptions): DataFrame { + const { indexCol = null, parseDates } = options; + + // Resolve the index column name (if any). + let idxColName: string | null = null; + if (indexCol !== null && indexCol !== undefined) { + if (typeof indexCol === "number") { + const col = result.columns[indexCol]; + if (col !== undefined) { + idxColName = col; + } + } else { + idxColName = indexCol; + } + } + + // Build column arrays, excluding the index column. + const dataColumns: string[] = []; + const columnData: Record = {}; + + for (const col of result.columns) { + if (col === idxColName) continue; + dataColumns.push(col); + columnData[col] = []; + } + + // Populate column arrays. + for (const row of result.rows) { + for (const col of dataColumns) { + const arr = columnData[col]; + if (arr !== undefined) { + const raw = row[col]; + arr.push(raw !== undefined ? sqlValueToScalar(raw) : null); + } + } + } + + // Parse date columns (convert to ms-since-epoch numbers). 
+ if (parseDates !== undefined) { + for (const col of parseDates) { + const arr = columnData[col]; + if (arr !== undefined) { + for (let i = 0; i < arr.length; i++) { + const v = arr[i]; + if (v !== null && v !== undefined && typeof v === "string") { + const ms = Date.parse(v); + arr[i] = Number.isNaN(ms) ? v : ms; + } + } + } + } + } + + // Build the row index. + const indexVals: Label[] = []; + if (idxColName !== null) { + for (const row of result.rows) { + const raw = row[idxColName]; + const v: SqlValue = raw !== undefined ? raw : null; + if (v instanceof Uint8Array) { + indexVals.push(Buffer.from(v).toString("hex")); + } else { + indexVals.push(v); + } + } + } + + const rowIndex = idxColName !== null ? new Index(indexVals, idxColName) : undefined; + + return DataFrame.fromColumns( + columnData as Record, + rowIndex !== undefined ? { index: rowIndex } : {}, + ); +} + +/** Quote an identifier with double-quotes (ANSI SQL). */ +function quoteIdent(name: string): string { + return `"${name.replace(/"/g, '""')}"`; +} + +/** Build a SELECT statement for {@link readSqlTable}. */ +function buildSelectQuery(tableName: string, options: ReadSqlTableOptions): string { + const { schema, columns } = options; + + const qualifiedTable = + schema !== undefined ? `${quoteIdent(schema)}.${quoteIdent(tableName)}` : quoteIdent(tableName); + + const colList = + columns !== undefined && columns.length > 0 ? columns.map(quoteIdent).join(", ") : "*"; + + return `SELECT ${colList} FROM ${qualifiedTable}`; +} + +/** + * Heuristic: does the string look like a SQL query (contains whitespace) or a + * plain table name? + */ +function looksLikeQuery(sqlOrTable: string): boolean { + return /\s/.test(sqlOrTable.trim()); +} + +// ─── public API ─────────────────────────────────────────────────────────────── + +/** + * Execute a SQL SELECT query and return the result as a {@link DataFrame}. + * + * Mirrors `pandas.read_sql_query()`. 
+ * + * ```ts + * import { readSqlQuery } from "tsb"; + * + * const df = readSqlQuery("SELECT id, name FROM users WHERE active = ?", db, { + * params: [1], + * indexCol: "id", + * }); + * ``` + * + * @param sql SQL SELECT string (may include parameter placeholders). + * @param conn Database adapter implementing {@link SqlConnection}. + * @param options See {@link ReadSqlQueryOptions}. + */ +export function readSqlQuery( + sql: string, + conn: SqlConnection, + options: ReadSqlQueryOptions = {}, +): DataFrame { + const { params } = options; + const result = params !== undefined ? conn.query(sql, params) : conn.query(sql); + return resultToDataFrame(result, options); +} + +/** + * Read an entire database table into a {@link DataFrame}. + * + * Mirrors `pandas.read_sql_table()`. + * + * ```ts + * import { readSqlTable } from "tsb"; + * + * const df = readSqlTable("products", db, { + * schema: "inventory", + * columns: ["id", "name", "price"], + * }); + * ``` + * + * @param tableName Name of the table to read. + * @param conn Database adapter implementing {@link SqlConnection}. + * @param options See {@link ReadSqlTableOptions}. + */ +export function readSqlTable( + tableName: string, + conn: SqlConnection, + options: ReadSqlTableOptions = {}, +): DataFrame { + if (conn.listTables !== undefined) { + const tables = conn.listTables(); + const tableNameLower = tableName.toLowerCase(); + const found = tables.some((t) => t.toLowerCase() === tableNameLower); + if (!found) { + throw new TableNotFoundError(tableName); + } + } + + const sql = buildSelectQuery(tableName, options); + const result = conn.query(sql); + return resultToDataFrame(result, options); +} + +/** + * Read a SQL query **or** table name into a {@link DataFrame}. + * + * Mirrors `pandas.read_sql()`. + * + * - If `sqlOrTable` contains whitespace it is treated as a SQL query string + * and executed via {@link readSqlQuery}. + * - Otherwise it is treated as a table name and delegated to + * {@link readSqlTable}. 
+ * + * ```ts + * import { readSql } from "tsb"; + * + * // Using a query + * const df1 = readSql("SELECT * FROM orders WHERE status = 'open'", db); + * + * // Using a table name + * const df2 = readSql("orders", db); + * ``` + * + * @param sqlOrTable SQL query string or bare table name. + * @param conn Database adapter implementing {@link SqlConnection}. + * @param options See {@link ReadSqlOptions}. + */ +export function readSql( + sqlOrTable: string, + conn: SqlConnection, + options: ReadSqlOptions = {}, +): DataFrame { + if (looksLikeQuery(sqlOrTable)) { + return readSqlQuery(sqlOrTable, conn, options); + } + return readSqlTable(sqlOrTable, conn, options); +} + +/** + * Write a {@link DataFrame} to a SQL table. + * + * Mirrors `pandas.DataFrame.to_sql()`. + * + * When the adapter provides an {@link SqlConnection.insert} method, writes are + * delegated to it (enabling driver-native batching). Otherwise each row is + * written via an individual `INSERT INTO` statement through + * {@link SqlConnection.query}. + * + * ```ts + * import { toSql } from "tsb"; + * + * const rowsWritten = toSql(df, "staging_data", db, { ifExists: "replace" }); + * ``` + * + * @param df Source DataFrame. + * @param tableName Destination table name. + * @param conn Database adapter implementing {@link SqlConnection}. + * @param options See {@link ToSqlOptions}. + * @returns Number of rows written. + */ +export function toSql( + df: DataFrame, + tableName: string, + conn: SqlConnection, + options: ToSqlOptions = {}, +): number { + const { ifExists = "fail", index = true, indexLabel = null, chunksize } = options; + + // Build ordered column list. 
+ const dataCols = [...df.columns.values] as string[]; + const allCols: string[] = []; + let idxLabel = "index"; + if (index) { + const nameFromIndex = df.index.name; + if (indexLabel !== null && indexLabel !== undefined) { + idxLabel = indexLabel; + } else if (typeof nameFromIndex === "string" && nameFromIndex.length > 0) { + idxLabel = nameFromIndex; + } + allCols.push(idxLabel); + } + for (const c of dataCols) { + allCols.push(c); + } + + // Build row objects. + const records = df.toRecords(); + const indexValues = [...df.index.values] as Label[]; + const rows: SqlRow[] = []; + + for (let i = 0; i < records.length; i++) { + const rec = records[i]; + const row: SqlRow = {}; + if (index) { + const idxVal = indexValues[i]; + row[idxLabel] = labelToSqlValue(idxVal !== undefined ? idxVal : null); + } + if (rec !== undefined) { + for (const col of dataCols) { + const v = rec[col]; + row[col] = scalarToSqlValue(v !== undefined ? v : null); + } + } + rows.push(row); + } + + if (conn.insert !== undefined) { + return conn.insert(tableName, rows, allCols, ifExists); + } + + // Fallback: emit INSERT statements via query(). + return insertViaQuery(tableName, rows, allCols, ifExists, chunksize, conn); +} + +// ─── helpers for toSql ──────────────────────────────────────────────────────── + +/** Convert a {@link Label} to a {@link SqlValue}. */ +function labelToSqlValue(label: Label): SqlValue { + if (label === null) return null; + if (typeof label === "boolean") return label; + if (typeof label === "number") return label; + if (typeof label === "string") return label; + if (label instanceof Date) return label.toISOString(); + return String(label); +} + +/** Convert a tsb {@link Scalar} to a {@link SqlValue}. 
*/ +function scalarToSqlValue(s: Scalar): SqlValue { + if (s === null || s === undefined) return null; + if (typeof s === "boolean") return s; + if (typeof s === "number") return s; + if (typeof s === "string") return s; + if (typeof s === "bigint") return Number(s); + if (s instanceof Date) return s.toISOString(); + // TimedeltaLike — store as total milliseconds + if (typeof s === "object" && "totalMs" in s) return s.totalMs; + return null; +} + +/** + * Escape a string for inclusion in a SQL literal. + * Only used in the fallback query path. + */ +function escapeSqlString(s: string): string { + return s.replace(/'/g, "''"); +} + +/** Format a {@link SqlValue} as a SQL literal for the fallback path. */ +function sqlLiteral(v: SqlValue): string { + if (v === null) return "NULL"; + if (typeof v === "boolean") return v ? "1" : "0"; + if (typeof v === "number") { + if (Number.isNaN(v)) return "NULL"; + if (!Number.isFinite(v)) return "NULL"; + return String(v); + } + if (typeof v === "string") return `'${escapeSqlString(v)}'`; + // Uint8Array (blob): represent as hex literal (SQLite: X'…') + return `X'${Buffer.from(v).toString("hex")}'`; +} + +/** + * Insert rows by emitting individual INSERT statements through + * {@link SqlConnection.query}. Falls back for adapters that don't implement + * {@link SqlConnection.insert}. + */ +function insertViaQuery( + tableName: string, + rows: readonly SqlRow[], + columns: readonly string[], + ifExists: IfExistsStrategy, + chunksize: number | undefined, + conn: SqlConnection, +): number { + if (rows.length === 0) return 0; + + const quotedTable = quoteIdent(tableName); + const colList = columns.map(quoteIdent).join(", "); + + // Check for pre-existing table when strategy is "fail". 
+ if (ifExists === "fail" && conn.listTables !== undefined) { + const tables = conn.listTables(); + const tl = tableName.toLowerCase(); + if (tables.some((t) => t.toLowerCase() === tl)) { + throw new TableExistsError(tableName); + } + } + + // "replace": attempt DROP TABLE first. + if (ifExists === "replace") { + try { + conn.query(`DROP TABLE IF EXISTS ${quotedTable}`); + } catch { + // Some minimal adapters may not support DDL via query(). + } + } + + const batchSize = chunksize !== undefined && chunksize > 0 ? chunksize : rows.length; + let written = 0; + + for (let start = 0; start < rows.length; start += batchSize) { + const end = Math.min(start + batchSize, rows.length); + + for (let i = start; i < end; i++) { + const row = rows[i]; + if (row === undefined) continue; + const valList = columns.map((col) => sqlLiteral(row[col] ?? null)).join(", "); + conn.query(`INSERT INTO ${quotedTable} (${colList}) VALUES (${valList})`); + written += 1; + } + } + + return written; +} diff --git a/src/io/stata.ts b/src/io/stata.ts new file mode 100644 index 00000000..b5151660 --- /dev/null +++ b/src/io/stata.ts @@ -0,0 +1,1149 @@ +/** + * readStata / toStata — Stata DTA file I/O for DataFrame. 
+ * + * Mirrors `pandas.read_stata()` and `DataFrame.to_stata()`: + * - `readStata(data, options?)` — parse a Stata DTA binary buffer into a DataFrame + * - `toStata(df, options?)` — serialize a DataFrame to a Stata DTA binary buffer + * + * Supported DTA versions: + * - Reading: v114/v115 (old binary format, auto-detects byte order) + * - Reading: v117/v118/v119 (new XML-tagged format, auto-detects byte order) + * - Writing: v118 (new format, little-endian) + * + * Column types handled: + * - byte (int8), int (int16), long (int32), float (float32), double (float64) + * - str1..str2045 (fixed-width strings), strl (long strings, v117+) + * - Missing values → `null` + * - Value labels optionally applied with `convertCategoricals: true` + * + * @module + */ + +import { DataFrame } from "../core/frame.ts"; +import { Index } from "../core/index.ts"; +import type { Label, Scalar } from "../types.ts"; + +// ─── Public Types ───────────────────────────────────────────────────────────── + +/** Options for {@link readStata}. */ +export interface ReadStataOptions { + /** + * Column name or 0-based index to use as the row index. + * Default: `null` (RangeIndex). + */ + readonly indexCol?: string | number | null; + /** Maximum number of data rows to read. Default: unlimited. */ + readonly nRows?: number; + /** + * Apply value labels to integer columns that have them, replacing + * numeric codes with their string labels. Default: `false`. + */ + readonly convertCategoricals?: boolean; + /** + * Only include these column names. `null` = all columns. + * Default: `null`. + */ + readonly usecols?: readonly string[] | null; +} + +/** Options for {@link toStata}. */ +export interface ToStataOptions { + /** Dataset label (up to 80 characters). Default: `""`. */ + readonly dataLabel?: string; + /** + * Write the DataFrame's row index as a column named `"_index"`. + * Default: `false`. 
+ */ + readonly writeIndex?: boolean; + /** + * Map of column name → variable label (up to 80 characters). + * Default: `{}`. + */ + readonly variableLabels?: Readonly>; +} + +// ─── Internal Types ─────────────────────────────────────────────────────────── + +/** Column descriptor parsed from a DTA file. */ +interface ColDesc { + readonly name: string; + /** Raw Stata type code. */ + readonly code: number; + /** Byte width of this column in the data section. */ + readonly width: number; + /** True if this column holds a strl reference (v117+). */ + readonly isStrl: boolean; +} + +/** Internal representation of a fully parsed DTA file. */ +interface DtaData { + readonly cols: ColDesc[]; + readonly rows: Scalar[][]; + readonly lblNames: string[]; + readonly varLabels: string[]; + readonly valueLabels: Map>; +} + +// ─── Constants ──────────────────────────────────────────────────────────────── + +/** New-format (v117+) numeric type codes. */ +const TC_DOUBLE = 65526; +const TC_FLOAT = 65527; +const TC_LONG = 65528; +const TC_INT = 65529; +const TC_BYTE = 65530; +const TC_STRL = 32768; + +/** Missing-value sentinels for integer types. */ +const MISS_BYTE = 101; // int8 >= 101 is missing +const MISS_INT = 32741; // int16 >= 32741 is missing +const MISS_LONG = 2147483621; // int32 >= 2147483621 is missing + +/** Stata float missing: bit pattern 0x7f000000 or higher. */ +const MISS_F32_BITS = 0x7f000000; +/** Stata double missing: high-32-bit pattern 0x7fe00000 or higher. */ +const MISS_F64_HI = 0x7fe00000; +/** Stata double missing written as uint32 pair (LE). */ +const MISS_F64_LO32 = 0x00000000; +const MISS_F64_HI32 = 0x7fe00000; + +// ─── Missing Value Helpers ──────────────────────────────────────────────────── + +function isMissF32(view: DataView, pos: number, le: boolean): boolean { + const bits = view.getUint32(pos, le); + // Stata float missing values have sign=0 and bits >= 0x7f000000. 
+ // Negative floats have bit 31 set (bits >= 0x80000000) and must not be treated as missing. + return bits >= MISS_F32_BITS && bits < 0x80000000; +} + +function isMissF64(view: DataView, pos: number, le: boolean): boolean { + const hiOff = le ? pos + 4 : pos; + const hi = view.getUint32(hiOff, le); + // Stata double missing values have sign=0 and high bits >= 0x7fe00000. + // Negative doubles have bit 31 set (hi >= 0x80000000) and must not be treated as missing. + return hi >= MISS_F64_HI && hi < 0x80000000; +} + +// ─── Text Codecs ────────────────────────────────────────────────────────────── + +const ENC = new TextEncoder(); +const LATIN1 = new TextDecoder("latin1"); +const UTF8D = new TextDecoder("utf-8"); + +// ─── BinReader ──────────────────────────────────────────────────────────────── + +class BinReader { + pos = 0; + /** Byte order: `true` = little-endian, `false` = big-endian. Mutable. */ + le: boolean; + private readonly view: DataView; + readonly u8: Uint8Array; + + constructor(data: Uint8Array | ArrayBuffer, le = true) { + if (data instanceof ArrayBuffer) { + this.u8 = new Uint8Array(data); + this.view = new DataView(data); + } else { + this.u8 = data; + this.view = new DataView(data.buffer, data.byteOffset, data.byteLength); + } + this.le = le; + } + + seek(p: number): void { + this.pos = p; + } + + skip(n: number): void { + this.pos += n; + } + + readU8(): number { + return this.view.getUint8(this.pos++); + } + + readI8(): number { + return this.view.getInt8(this.pos++); + } + + readU16(): number { + const v = this.view.getUint16(this.pos, this.le); + this.pos += 2; + return v; + } + + readI16(): number { + const v = this.view.getInt16(this.pos, this.le); + this.pos += 2; + return v; + } + + readU32(): number { + const v = this.view.getUint32(this.pos, this.le); + this.pos += 4; + return v; + } + + readI32(): number { + const v = this.view.getInt32(this.pos, this.le); + this.pos += 4; + return v; + } + + readF32(): number { + const v = 
this.view.getFloat32(this.pos, this.le); + this.pos += 4; + return v; + } + + readF64(): number { + const v = this.view.getFloat64(this.pos, this.le); + this.pos += 8; + return v; + } + + /** Read uint64 as a JS number (safe for values ≤ 2^53). */ + readU64(): number { + const a = this.view.getUint32(this.pos, this.le); + const b = this.view.getUint32(this.pos + 4, this.le); + this.pos += 8; + return this.le ? a + b * 4294967296 : b + a * 4294967296; + } + + readBytes(n: number): Uint8Array { + const s = this.u8.subarray(this.pos, this.pos + n); + this.pos += n; + return s; + } + + /** Read a fixed-width field as a null-terminated Latin-1 string. */ + readCStr(fieldLen: number): string { + const b = this.readBytes(fieldLen); + let end = 0; + while (end < b.length && (b[end] ?? 0) !== 0) { + end++; + } + return LATIN1.decode(b.subarray(0, end)); + } + + /** Read a fixed-width field, trim trailing null bytes and spaces. */ + readTrimStr(fieldLen: number): string { + const b = this.readBytes(fieldLen); + let end = b.length; + while (end > 0 && ((b[end - 1] ?? 0) === 0 || (b[end - 1] ?? 0) === 0x20)) { + end--; + } + return LATIN1.decode(b.subarray(0, end)); + } + + /** Read and verify an ASCII tag. Throws on mismatch. */ + expectTag(tag: string): void { + const tb = ENC.encode(tag); + for (let i = 0; i < tb.length; i++) { + if ((this.u8[this.pos + i] ?? -1) !== (tb[i] ?? 0)) { + const got = LATIN1.decode(this.u8.subarray(this.pos, this.pos + tb.length)); + throw new Error(`Stata DTA: expected "${tag}", got "${got}" at offset ${this.pos}`); + } + } + this.pos += tb.length; + } + + /** Scan forward until the given ASCII tag is found and consumed. 
*/ + skipToTag(tag: string): void { + const tb = ENC.encode(tag); + const len = tb.length; + for (let i = this.pos; i + len <= this.u8.length; i++) { + let ok = true; + for (let j = 0; j < len; j++) { + if (this.u8[i + j] !== tb[j]) { + ok = false; + break; + } + } + if (ok) { + this.pos = i + len; + return; + } + } + throw new Error(`Stata DTA: tag "${tag}" not found`); + } + + get dataView(): DataView { + return this.view; + } +} + +// ─── BinWriter ──────────────────────────────────────────────────────────────── + +class BinWriter { + private buf: Uint8Array; + private _pos = 0; + private view: DataView; + readonly le: boolean; + + constructor(capacity = 8192, le = true) { + this.buf = new Uint8Array(capacity); + this.view = new DataView(this.buf.buffer); + this.le = le; + } + + get pos(): number { + return this._pos; + } + + private grow(need: number): void { + if (this._pos + need <= this.buf.length) return; + let next = this.buf.length * 2; + while (this._pos + need > next) next *= 2; + const nb = new Uint8Array(next); + nb.set(this.buf.subarray(0, this._pos)); + this.buf = nb; + this.view = new DataView(nb.buffer); + } + + writeU8(v: number): void { + this.grow(1); + this.view.setUint8(this._pos++, v); + } + + writeI8(v: number): void { + this.grow(1); + this.view.setInt8(this._pos++, v); + } + + writeU16(v: number): void { + this.grow(2); + this.view.setUint16(this._pos, v, this.le); + this._pos += 2; + } + + writeI16(v: number): void { + this.grow(2); + this.view.setInt16(this._pos, v, this.le); + this._pos += 2; + } + + writeU32(v: number): void { + this.grow(4); + this.view.setUint32(this._pos, v, this.le); + this._pos += 4; + } + + writeI32(v: number): void { + this.grow(4); + this.view.setInt32(this._pos, v, this.le); + this._pos += 4; + } + + writeF32(v: number): void { + this.grow(4); + this.view.setFloat32(this._pos, v, this.le); + this._pos += 4; + } + + writeF64(v: number): void { + this.grow(8); + this.view.setFloat64(this._pos, v, this.le); + 
this._pos += 8; + } + + writeU64(v: number): void { + this.grow(8); + const lo = v >>> 0; + const hi = Math.floor(v / 4294967296) >>> 0; + if (this.le) { + this.view.setUint32(this._pos, lo, true); + this.view.setUint32(this._pos + 4, hi, true); + } else { + this.view.setUint32(this._pos, hi, false); + this.view.setUint32(this._pos + 4, lo, false); + } + this._pos += 8; + } + + /** Overwrite a previously-written uint64 value at `offset`. */ + patchU64(offset: number, v: number): void { + const lo = v >>> 0; + const hi = Math.floor(v / 4294967296) >>> 0; + if (this.le) { + this.view.setUint32(offset, lo, true); + this.view.setUint32(offset + 4, hi, true); + } else { + this.view.setUint32(offset, hi, false); + this.view.setUint32(offset + 4, lo, false); + } + } + + writeBytes(b: Uint8Array): void { + this.grow(b.length); + this.buf.set(b, this._pos); + this._pos += b.length; + } + + writeAscii(s: string): void { + this.writeBytes(ENC.encode(s)); + } + + /** Write a null-padded fixed-length ASCII field of exactly `fieldLen` bytes. */ + writeFixed(s: string, fieldLen: number): void { + this.grow(fieldLen); + const b = ENC.encode(s); + const n = Math.min(b.length, fieldLen); + for (let i = 0; i < n; i++) this.view.setUint8(this._pos + i, b[i] ?? 0); + for (let i = n; i < fieldLen; i++) this.view.setUint8(this._pos + i, 0); + this._pos += fieldLen; + } + + finalize(): Uint8Array { + return this.buf.slice(0, this._pos); + } +} + +// ─── Old Format Parser (v114/v115) ──────────────────────────────────────────── + +function parseOldFormat(u8: Uint8Array, version: number): DtaData { + const byteOrderCode = u8[1] ?? 
2; + const le = byteOrderCode === 2; // 2 = LOHI (little-endian), 1 = HILO (big-endian) + const r = new BinReader(u8, le); + + r.skip(4); // ds_format, byte_order, filetype, padding + const nvar = r.readU16(); + const nobs = r.readU32(); + r.readCStr(81); // data_label (ignored) + r.readCStr(18); // time_stamp (ignored) + // offset = 109 + + // typlist: 1 byte per column + const stataTypes: number[] = []; + for (let i = 0; i < nvar; i++) stataTypes.push(r.readU8()); + + // varlist + const colSize = version > 113 ? 33 : 10; + const names: string[] = []; + for (let i = 0; i < nvar; i++) names.push(r.readCStr(colSize)); + + // srtlist (skip) + r.skip((nvar + 1) * 2); + + // fmtlist (skip) + const fmtSize = version > 113 ? 49 : 13; + r.skip(nvar * fmtSize); + + // lbllist (value label names) + const lblSize = version > 113 ? 33 : 10; + const lblNames: string[] = []; + for (let i = 0; i < nvar; i++) lblNames.push(r.readCStr(lblSize)); + + // variable_labels + const varLabels: string[] = []; + for (let i = 0; i < nvar; i++) varLabels.push(r.readCStr(81)); + + // characteristics: skip until end marker (type == 0) + while (r.pos + 2 < u8.length) { + const chType = r.readU16(); + if (chType === 0) break; + r.skip(colSize); // varname + r.skip(colSize); // charname + const len = r.readU32(); + r.skip(len); + } + + // Build column descriptors + const cols: ColDesc[] = []; + for (let i = 0; i < nvar; i++) { + const t = stataTypes[i] ?? 255; + let width: number; + if (t <= 244) { + width = t; // str + } else if (t === 251) { + width = 1; // byte + } else if (t === 252) { + width = 2; // int + } else if (t === 253 || t === 254) { + width = 4; // long or float + } else { + width = 8; // double (255) or unknown + } + cols.push({ name: names[i] ?? 
`var${i}`, code: t, width, isStrl: false }); + } + + // Read data rows + const dv = r.dataView; + const rows: Scalar[][] = []; + for (let row = 0; row < nobs; row++) { + const rowData: Scalar[] = []; + for (const col of cols) { + const t = col.code; + if (t <= 244) { + rowData.push(r.readTrimStr(t)); + } else if (t === 251) { + // byte (int8): missing if >= MISS_BYTE + const v = r.readI8(); + rowData.push(v >= MISS_BYTE ? null : v); + } else if (t === 252) { + // int (int16): missing if >= MISS_INT + const v = r.readI16(); + rowData.push(v >= MISS_INT ? null : v); + } else if (t === 253) { + // long (int32): missing if >= MISS_LONG + const v = r.readI32(); + rowData.push(v >= MISS_LONG ? null : v); + } else if (t === 254) { + // float (float32): check bit pattern + const missing = isMissF32(dv, r.pos, le); + const v = r.readF32(); + rowData.push(missing ? null : v); + } else { + // double (float64): check bit pattern + const missing = isMissF64(dv, r.pos, le); + const v = r.readF64(); + rowData.push(missing ? null : v); + } + } + rows.push(rowData); + } + + const valueLabels = parseOldValueLabels(r, version); + return { cols, rows, lblNames, varLabels, valueLabels }; +} + +function parseOldValueLabels(r: BinReader, version: number): Map> { + const result = new Map>(); + const lblSize = version > 113 ? 33 : 10; + + while (r.pos + lblSize + 11 < r.u8.length) { + const labname = r.readCStr(lblSize); + r.skip(3); // padding + const n = r.readU32(); + const txtlen = r.readU32(); + if (labname.length === 0 || n === 0 || txtlen === 0) break; + if (r.pos + n * 8 + txtlen > r.u8.length) break; + + const offsets: number[] = []; + for (let i = 0; i < n; i++) offsets.push(r.readU32()); + const values: number[] = []; + for (let i = 0; i < n; i++) values.push(r.readI32()); + const txt = r.readBytes(txtlen); + + const map = new Map(); + for (let i = 0; i < n; i++) { + const off = offsets[i] ?? 0; + let end = off; + while (end < txt.length && (txt[end] ?? 
0) !== 0) end++; + const label = LATIN1.decode(txt.subarray(off, end)); + const val = values[i]; + if (val !== undefined) map.set(val, label); + } + result.set(labname, map); + } + return result; +} + +// ─── New Format Parser (v117/v118/v119) ─────────────────────────────────────── + +function parseNewFormat(u8: Uint8Array, version: number): DtaData { + const r = new BinReader(u8, true); // initially LE; updated after reading byteorder + + r.expectTag(""); + r.expectTag("
"); + r.expectTag(""); + r.skip(3); // 3-byte ASCII version string + r.expectTag(""); + r.expectTag(""); + const bo = LATIN1.decode(r.readBytes(3)); + r.le = bo !== "MSF"; // "LSF" = little-endian, "MSF" = big-endian + r.expectTag(""); + r.expectTag(""); + const nvar = r.readU16(); + r.expectTag(""); + r.expectTag(""); + const nobs = version >= 119 ? r.readU64() : r.readU32(); + r.expectTag(""); + r.expectTag(""); + r.expectTag(""); + const tsLen = version > 117 ? r.readU16() : r.readU8(); + r.skip(tsLen); + r.expectTag(""); + r.expectTag("
"); + + // Map: 14 × uint64 file offsets + r.expectTag(""); + const mapOff: number[] = []; + for (let i = 0; i < 14; i++) mapOff.push(r.readU64()); + r.expectTag(""); + + // variable_types + const seekVT = mapOff[2] ?? 0; + if (seekVT > 0) r.seek(seekVT); + r.expectTag(""); + const varCodes: number[] = []; + for (let i = 0; i < nvar; i++) varCodes.push(r.readU16()); + r.expectTag(""); + + // varnames + const seekVN = mapOff[3] ?? 0; + if (seekVN > 0) r.seek(seekVN); + r.expectTag(""); + const varNameLen = version >= 119 ? 129 : 33; + const names: string[] = []; + for (let i = 0; i < nvar; i++) names.push(r.readCStr(varNameLen)); + r.expectTag(""); + + // value_label_names (skip sortlist and formats) + const seekVLN = mapOff[6] ?? 0; + if (seekVLN > 0) r.seek(seekVLN); + r.expectTag(""); + const vlNameLen = version >= 119 ? 129 : 33; + const lblNames: string[] = []; + for (let i = 0; i < nvar; i++) lblNames.push(r.readCStr(vlNameLen)); + r.expectTag(""); + + // variable_labels + const seekVL = mapOff[7] ?? 0; + if (seekVL > 0) r.seek(seekVL); + r.expectTag(""); + const varLabels: string[] = []; + for (let i = 0; i < nvar; i++) varLabels.push(r.readCStr(81)); + r.expectTag(""); + + // Build column descriptors + const cols: ColDesc[] = []; + for (let i = 0; i < nvar; i++) { + const code = varCodes[i] ?? TC_DOUBLE; + let width: number; + let isStrl = false; + if (code <= 2045) { + width = code; // str (fixed string of that length) + } else if (code === TC_STRL) { + // strl reference: uint16 v + uint32 o (v117) or uint64 o (v118+) + width = version >= 118 ? 10 : 6; + isStrl = true; + } else if (code === TC_BYTE) { + width = 1; + } else if (code === TC_INT) { + width = 2; + } else if (code === TC_LONG || code === TC_FLOAT) { + width = 4; + } else { + width = 8; // TC_DOUBLE or unknown + } + cols.push({ name: names[i] ?? 
`var${i}`, code, width, isStrl }); + } + + // Read strls section if any strl columns exist + const strlMap = new Map(); // "v,o" → string value + const seekST = mapOff[10] ?? 0; + if (seekST > 0 && cols.some((c) => c.isStrl)) { + r.seek(seekST); + r.expectTag(""); + while (r.pos + 3 <= r.u8.length) { + if ((r.u8[r.pos] ?? 0) === 0x3c) break; // '<' = start of + // Check for "GSO" magic + if ( + (r.u8[r.pos] ?? 0) !== 0x47 || + (r.u8[r.pos + 1] ?? 0) !== 0x53 || + (r.u8[r.pos + 2] ?? 0) !== 0x4f + ) { + break; + } + r.skip(3); // "GSO" + const gsoV = r.readU16(); + const gsoO = version >= 118 ? r.readU64() : r.readU32(); + const t = r.readU8(); // 129=binary, 130=string + const len = r.readU32(); + const data = r.readBytes(len); + if (t === 130) { + // string: null-terminated UTF-8 + let end = 0; + while (end < data.length && (data[end] ?? 0) !== 0) end++; + strlMap.set(`${gsoV},${gsoO}`, UTF8D.decode(data.subarray(0, end))); + } + } + r.skipToTag(""); + } + + // Read data section + const seekDA = mapOff[9] ?? 0; + if (seekDA > 0) r.seek(seekDA); + r.expectTag(""); + const dv = r.dataView; + const rows: Scalar[][] = []; + for (let row = 0; row < nobs; row++) { + const rowData: Scalar[] = []; + for (const col of cols) { + const code = col.code; + if (code <= 2045) { + rowData.push(r.readTrimStr(code)); + } else if (col.isStrl) { + const gv = r.readU16(); + const go = version >= 118 ? r.readU64() : r.readU32(); + rowData.push(strlMap.get(`${gv},${go}`) ?? null); + } else if (code === TC_BYTE) { + const v = r.readI8(); + rowData.push(v >= MISS_BYTE ? null : v); + } else if (code === TC_INT) { + const v = r.readI16(); + rowData.push(v >= MISS_INT ? null : v); + } else if (code === TC_LONG) { + const v = r.readI32(); + rowData.push(v >= MISS_LONG ? null : v); + } else if (code === TC_FLOAT) { + const missing = isMissF32(dv, r.pos, r.le); + const v = r.readF32(); + rowData.push(missing ? 
null : v); + } else { + // TC_DOUBLE + const missing = isMissF64(dv, r.pos, r.le); + const v = r.readF64(); + rowData.push(missing ? null : v); + } + } + rows.push(rowData); + } + r.expectTag(""); + + // Value labels + const seekVA = mapOff[11] ?? 0; + if (seekVA > 0) r.seek(seekVA); + const valueLabels = parseNewValueLabels(r, version); + return { cols, rows, lblNames, varLabels, valueLabels }; +} + +function parseNewValueLabels(r: BinReader, version: number): Map> { + const result = new Map>(); + const lblSize = version >= 119 ? 129 : 33; + + r.expectTag(""); + while (r.pos + 5 < r.u8.length) { + if ((r.u8[r.pos] ?? 0) === 0x3c && (r.u8[r.pos + 1] ?? 0) === 0x2f) break; // ""); + r.readU32(); // total byte length (informational) + const labname = r.readCStr(lblSize); + r.skip(3); // padding + const n = r.readU32(); + const txtlen = r.readU32(); + const offsets: number[] = []; + for (let i = 0; i < n; i++) offsets.push(r.readU32()); + const values: number[] = []; + for (let i = 0; i < n; i++) values.push(r.readI32()); + const txt = r.readBytes(txtlen); + r.expectTag(""); + + if (labname.length > 0 && n > 0) { + const map = new Map(); + for (let i = 0; i < n; i++) { + const off = offsets[i] ?? 0; + let end = off; + while (end < txt.length && (txt[end] ?? 
0) !== 0) end++; + const label = UTF8D.decode(txt.subarray(off, end)); + const val = values[i]; + if (val !== undefined) map.set(val, label); + } + result.set(labname, map); + } + } + return result; +} + +// ─── DataFrame Builder ──────────────────────────────────────────────────────── + +function isLabel(v: Scalar): v is Label { + return ( + v === null || + typeof v === "number" || + typeof v === "string" || + typeof v === "boolean" || + v instanceof Date + ); +} + +function buildDataFrame(data: DtaData, opts: ReadStataOptions): DataFrame { + const { cols, rows, lblNames, valueLabels } = data; + const { indexCol = null, nRows, convertCategoricals = false, usecols = null } = opts; + const limit = nRows !== undefined ? Math.min(nRows, rows.length) : rows.length; + + // Determine active column indices + let activeIdx = cols.map((_, i) => i); + if (usecols !== null) { + const keep = new Set(usecols); + activeIdx = activeIdx.filter((i) => keep.has(cols[i]?.name ?? "")); + } + + // Build column arrays from rows + const arrays: Scalar[][] = activeIdx.map(() => []); + for (let ri = 0; ri < limit; ri++) { + const row = rows[ri]; + if (row === undefined) continue; + for (let ci = 0; ci < activeIdx.length; ci++) { + const colIdx = activeIdx[ci] ?? 0; + (arrays[ci] ?? []).push(row[colIdx] ?? null); + } + } + + // Apply value labels (convertCategoricals) + if (convertCategoricals) { + for (let ci = 0; ci < activeIdx.length; ci++) { + const colIdx = activeIdx[ci] ?? 0; + const lblName = lblNames[colIdx] ?? 
""; + if (lblName.length === 0) continue; + const lblMap = valueLabels.get(lblName); + if (lblMap === undefined) continue; + const arr = arrays[ci]; + if (arr === undefined) continue; + for (let ri = 0; ri < arr.length; ri++) { + const v = arr[ri]; + if (typeof v === "number") { + const label = lblMap.get(v); + if (label !== undefined) arr[ri] = label; + } + } + } + } + + // Build column data record + const colData: Record = {}; + for (let ci = 0; ci < activeIdx.length; ci++) { + const colIdx = activeIdx[ci] ?? 0; + colData[cols[colIdx]?.name ?? `var${colIdx}`] = arrays[ci] ?? []; + } + + // Handle indexCol + let idxName: string | null = null; + if (typeof indexCol === "string") { + idxName = indexCol; + } else if (typeof indexCol === "number") { + const mapped = activeIdx[indexCol]; + if (mapped !== undefined) idxName = cols[mapped]?.name ?? null; + } + + if (idxName !== null && idxName in colData) { + const idxData = (colData[idxName] ?? []).filter(isLabel); + const rest: Record = {}; + for (const [k, v] of Object.entries(colData)) { + if (k !== idxName) rest[k] = v; + } + return DataFrame.fromColumns(rest, { index: new Index(idxData) }); + } + + return DataFrame.fromColumns(colData); +} + +// ─── readStata ──────────────────────────────────────────────────────────────── + +/** + * Parse a Stata DTA file into a {@link DataFrame}. + * + * Supports DTA versions 114/115 (old binary format) and 117/118/119 + * (new XML-tagged format). Numeric missing values are represented as `null`. + * + * @example + * ```ts + * import { readStata } from "tsb"; + * const buf = await Bun.file("data.dta").arrayBuffer(); + * const df = readStata(buf); + * df.shape; // [nobs, nvar] + * df.columns.toArray(); // ["age", "income", ...] + * ``` + */ +export function readStata( + data: Uint8Array | ArrayBuffer, + options: ReadStataOptions = {}, +): DataFrame { + const u8 = data instanceof Uint8Array ? 
data : new Uint8Array(data); + if (u8.length < 4) throw new Error("Stata DTA: buffer too small"); + + let parsed: DtaData; + const firstByte = u8[0] ?? 0; + + if (firstByte === 0x3c) { + // New format: starts with "" + const header100 = LATIN1.decode(u8.subarray(0, Math.min(100, u8.length))); + const m = /(\d+)<\/release>/.exec(header100); + const version = m?.[1] !== undefined ? Number.parseInt(m[1], 10) : 118; + parsed = parseNewFormat(u8, version); + } else { + // Old binary format: first byte is the version number + const version = firstByte; + if (version < 104 || version > 115) { + throw new Error(`Stata DTA: unsupported version byte ${version}`); + } + parsed = parseOldFormat(u8, version); + } + + return buildDataFrame(parsed, options); +} + +// ─── toStata ───────────────────────────────────────────────────────────────── + +/** + * Serialize a {@link DataFrame} to a Stata DTA v118 binary file. + * + * Column type mapping: + * - `number` → `double` (float64) + * - `boolean` → `byte` (int8, stored as 0/1) + * - `string` → `str` (fixed-width, up to 2045 bytes; longer strings truncated) + * - `null` / `undefined` → Stata missing value for the column's type + * + * @example + * ```ts + * import { DataFrame, toStata } from "tsb"; + * const df = DataFrame.fromColumns({ + * age: [25, 30, null], + * name: ["Alice", "Bob", "Carol"], + * }); + * const buf = toStata(df); + * await Bun.write("data.dta", buf); + * ``` + */ +export function toStata(df: DataFrame, options: ToStataOptions = {}): Uint8Array { + const { dataLabel = "", writeIndex = false, variableLabels = {} } = options; + + // Collect columns + const colNames: string[] = []; + const colArrays: Scalar[][] = []; + + if (writeIndex) { + colNames.push("_index"); + colArrays.push([...df.index.toArray()]); + } + for (const name of df.columns.values) { + colNames.push(name); + colArrays.push([...df.col(name).toArray()]); + } + + const nvar = colNames.length; + const nobs = df.shape[0]; + + // Determine Stata type 
for each column + const stataTypes: number[] = []; + for (let ci = 0; ci < nvar; ci++) { + const arr = colArrays[ci] ?? []; + let hasStr = false; + let maxStrLen = 0; + let allBoolOrNum = true; + let allBool = true; + for (const v of arr) { + if (v === null || v === undefined) continue; + if (typeof v === "string") { + hasStr = true; + allBoolOrNum = false; + allBool = false; + const len = ENC.encode(v).length; + if (len > maxStrLen) maxStrLen = len; + } else if (typeof v !== "boolean") { + allBool = false; + } + } + if (hasStr) { + stataTypes.push(Math.max(1, Math.min(maxStrLen, 2045))); + } else if (allBool && allBoolOrNum) { + stataTypes.push(TC_BYTE); + } else { + stataTypes.push(TC_DOUBLE); + } + } + + // Compute row width + let rowWidth = 0; + for (const t of stataTypes) { + if (t <= 2045) rowWidth += t; + else if (t === TC_BYTE) rowWidth += 1; + else if (t === TC_INT) rowWidth += 2; + else if (t === TC_LONG || t === TC_FLOAT) rowWidth += 4; + else rowWidth += 8; // TC_DOUBLE + } + + // Encode data label (UTF-8, max 80 bytes) + const labelRaw = dataLabel.length > 80 ? dataLabel.slice(0, 80) : dataLabel; + const labelBytes = ENC.encode(labelRaw); + + // Format timestamp: "dd Mon YYYY HH:MM" (always 17 bytes) + const now = new Date(); + const mos = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]; + const tsStr = [ + String(now.getUTCDate()).padStart(2, " "), + mos[now.getUTCMonth()] ?? "Jan", + String(now.getUTCFullYear()), + `${String(now.getUTCHours()).padStart(2, "0")}:${String(now.getUTCMinutes()).padStart(2, "0")}`, + ].join(" "); + const tsBytes = ENC.encode(tsStr); + + const w = new BinWriter(65536); + const mapSlots: number[] = []; // positions of each map uint64 in the output + + // Track offsets as we write sections + const sectionOffs = new Array(14).fill(0); + sectionOffs[0] = 0; // + + // ── ── + w.writeAscii(""); + + // ──
── + w.writeAscii("
"); + w.writeAscii("118"); + w.writeAscii("LSF"); + w.writeAscii(""); + w.writeU16(nvar); + w.writeAscii(""); + w.writeAscii(""); + w.writeU32(nobs); + w.writeAscii(""); + w.writeAscii(""); + w.writeAscii(""); + w.writeU16(tsBytes.length); + w.writeBytes(tsBytes); + w.writeAscii(""); + w.writeAscii("
"); + + // ── ── + sectionOffs[1] = w.pos; + w.writeAscii(""); + const mapDataStart = w.pos; // position of first uint64 in map + for (let i = 0; i < 14; i++) { + mapSlots.push(mapDataStart + i * 8); + w.writeU64(0); // placeholder + } + w.writeAscii(""); + + // ── ── + sectionOffs[2] = w.pos; + w.writeAscii(""); + for (const t of stataTypes) w.writeU16(t); + w.writeAscii(""); + + // ── ── + sectionOffs[3] = w.pos; + w.writeAscii(""); + for (const name of colNames) w.writeFixed(name.slice(0, 32), 33); + w.writeAscii(""); + + // ── ── + sectionOffs[4] = w.pos; + w.writeAscii(""); + for (let i = 0; i <= nvar; i++) w.writeU16(0); + w.writeAscii(""); + + // ── ── + sectionOffs[5] = w.pos; + w.writeAscii(""); + for (let ci = 0; ci < nvar; ci++) { + const t = stataTypes[ci] ?? TC_DOUBLE; + let fmt: string; + if (t <= 2045) { + fmt = `%${t}s`; + } else if (t === TC_BYTE || t === TC_INT) { + fmt = "%8.0g"; + } else if (t === TC_LONG) { + fmt = "%12.0g"; + } else if (t === TC_FLOAT) { + fmt = "%9.0g"; + } else { + fmt = "%10.0g"; // TC_DOUBLE + } + w.writeFixed(fmt, 57); + } + w.writeAscii(""); + + // ── ── + sectionOffs[6] = w.pos; + w.writeAscii(""); + for (let i = 0; i < nvar; i++) w.writeFixed("", 33); + w.writeAscii(""); + + // ── ── + sectionOffs[7] = w.pos; + w.writeAscii(""); + for (const name of colNames) { + const lbl = variableLabels[name] ?? ""; + w.writeFixed(lbl.slice(0, 80), 81); + } + w.writeAscii(""); + + // ── (empty) ── + sectionOffs[8] = w.pos; + w.writeAscii(""); + w.writeAscii(""); + + // ── ── + sectionOffs[9] = w.pos; + w.writeAscii(""); + for (let ri = 0; ri < nobs; ri++) { + for (let ci = 0; ci < nvar; ci++) { + const t = stataTypes[ci] ?? TC_DOUBLE; + const v = (colArrays[ci] ?? [])[ri] ?? null; + if (t <= 2045) { + // str: write bytes then null-pad to field length + const s = typeof v === "string" ? v : v !== null && v !== undefined ? 
String(v) : ""; + const sb = ENC.encode(s); + const n = Math.min(sb.length, t); + for (let j = 0; j < n; j++) w.writeU8(sb[j] ?? 0); + for (let j = n; j < t; j++) w.writeU8(0); + } else if (t === TC_BYTE) { + if (v === null || v === undefined) { + w.writeI8(MISS_BYTE); + } else { + const bv = typeof v === "boolean" ? (v ? 1 : 0) : Math.round(Number(v)); + w.writeI8(Math.max(-127, Math.min(100, bv))); + } + } else if (t === TC_INT) { + if (v === null || v === undefined) { + w.writeI16(MISS_INT); + } else { + w.writeI16(Math.max(-32767, Math.min(32740, Math.round(Number(v))))); + } + } else if (t === TC_LONG) { + if (v === null || v === undefined) { + w.writeI32(MISS_LONG); + } else { + w.writeI32(Math.max(-2147483647, Math.min(2147483620, Math.round(Number(v))))); + } + } else if (t === TC_FLOAT) { + if (v === null || v === undefined) { + w.writeU32(MISS_F32_BITS); + } else { + w.writeF32(Number(v)); + } + } else { + // TC_DOUBLE + if (v === null || v === undefined) { + // Write Stata double missing pattern (little-endian: low word first) + w.writeU32(MISS_F64_LO32); + w.writeU32(MISS_F64_HI32); + } else { + w.writeF64(Number(v)); + } + } + } + } + w.writeAscii(""); + + // ── (empty) ── + sectionOffs[10] = w.pos; + w.writeAscii(""); + w.writeAscii(""); + + // ── (empty) ── + sectionOffs[11] = w.pos; + w.writeAscii(""); + w.writeAscii(""); + + // ── ── + sectionOffs[12] = w.pos; // end-of-data marker + w.writeAscii(""); + + // Patch the map with actual section offsets + for (let i = 0; i < 14; i++) { + const slotPos = mapSlots[i]; + if (slotPos !== undefined) { + w.patchU64(slotPos, sectionOffs[i] ?? 0); + } + } + + return w.finalize(); +} diff --git a/src/io/xml.ts b/src/io/xml.ts new file mode 100644 index 00000000..d343e916 --- /dev/null +++ b/src/io/xml.ts @@ -0,0 +1,523 @@ +/** + * readXml / toXml — XML I/O for DataFrame. 
+ * + * Mirrors `pandas.read_xml()` and `DataFrame.to_xml()`: + * - `readXml(text, options?)` — parse an XML string into a DataFrame + * - `toXml(df, options?)` — serialize a DataFrame to an XML string + * + * Implemented without any external dependencies — uses a hand-rolled + * zero-dependency XML tokenizer that handles: + * - Attributes on row elements + * - Text-content child elements as columns + * - xmlns namespace prefixes (stripped for column names) + * - CDATA sections + * - XML comments (skipped) + * - Entity references (& < > ' " &#N; &#xN;) + * - nrows, usecols, xpath-like row selection (element name filter) + * - naValues, converters (auto-numeric coercion) + * - indexCol + * + * @module + */ + +import { DataFrame } from "../core/frame.ts"; +import { Index } from "../core/index.ts"; +import type { Label, Scalar } from "../types.ts"; + +function isLabel(v: Scalar): v is Label { + return ( + v === null || + typeof v === "number" || + typeof v === "string" || + typeof v === "boolean" || + v instanceof Date + ); +} + +// ─── public types ───────────────────────────────────────────────────────────── + +/** Options for {@link readXml}. */ +export interface ReadXmlOptions { + /** + * Local-name of the element to treat as a row. Defaults to the first + * repeating child element name found inside the document root. + */ + readonly rowTag?: string; + + /** + * Column name or 0-based column index to use as the row index. + * Defaults to a plain RangeIndex. + */ + readonly indexCol?: string | number | null; + + /** + * Only include these column names (subset). `null` = all columns. + */ + readonly usecols?: readonly string[] | null; + + /** + * Extra strings to treat as NaN in addition to the built-in defaults + * (`""`, `"NA"`, `"NaN"`, `"N/A"`, `"null"`, `"None"`, `"nan"`). + */ + readonly naValues?: readonly string[]; + + /** + * Whether to try to coerce column values to numbers. Defaults to `true`. 
+ */ + readonly converters?: boolean; + + /** + * Maximum number of rows to read. Defaults to unlimited. + */ + readonly nrows?: number; + + /** + * Whether to read element attributes as columns. Defaults to `true`. + */ + readonly attribs?: boolean; + + /** + * Whether to read child element text content as columns. Defaults to `true`. + */ + readonly elems?: boolean; +} + +/** Options for {@link toXml}. */ +export interface ToXmlOptions { + /** + * Name of the document root element. Defaults to `"data"`. + */ + readonly rootName?: string; + + /** + * Name of each row element. Defaults to `"row"`. + */ + readonly rowName?: string; + + /** + * Emit column values as XML attributes instead of child elements. + * Defaults to `false`. + */ + readonly attribs?: boolean; + + /** + * Whether to include the `` declaration. + * Defaults to `true`. + */ + readonly xmlDeclaration?: boolean; + + /** + * Map of prefix → namespace URI to declare on the root element. + * E.g. `{ xsi: "http://www.w3.org/2001/XMLSchema-instance" }`. + */ + readonly namespaces?: Readonly>; + + /** + * Indentation string (spaces or `"\t"`). Defaults to `" "` (2 spaces). + * Set to `""` or `null` to disable indentation. + */ + readonly indent?: string | null; + + /** + * Names of columns whose values should be wrapped in a CDATA section. + */ + readonly cdataCols?: readonly string[]; +} + +// ─── default NA strings ─────────────────────────────────────────────────────── + +const DEFAULT_NA: readonly string[] = ["", "NA", "NaN", "N/A", "null", "None", "nan"]; + +// ─── entity decoding ────────────────────────────────────────────────────────── + +const NAMED_ENTITIES: Readonly> = { + amp: "&", + lt: "<", + gt: ">", + apos: "'", + quot: '"', + nbsp: "\u00a0", +}; + +function decodeEntities(s: string): string { + return s.replace(/&([^;]+);/g, (_, ref: string) => { + if (ref.startsWith("#x") || ref.startsWith("#X")) { + const cp = Number.parseInt(ref.slice(2), 16); + return Number.isNaN(cp) ? 
`&${ref};` : String.fromCodePoint(cp); + } + if (ref.startsWith("#")) { + const cp = Number.parseInt(ref.slice(1), 10); + return Number.isNaN(cp) ? `&${ref};` : String.fromCodePoint(cp); + } + return NAMED_ENTITIES[ref] ?? `&${ref};`; + }); +} + +// ─── entity encoding ────────────────────────────────────────────────────────── + +function encodeEntities(s: string): string { + return s + .replace(/&/g, "&") + .replace(//g, ">") + .replace(/"/g, """) + .replace(/'/g, "'"); +} + +// ─── local name (strip namespace prefix) ────────────────────────────────────── + +function localName(qname: string): string { + const colon = qname.indexOf(":"); + return colon === -1 ? qname : qname.slice(colon + 1); +} + +// ─── sanitize column name for use as an XML element/attribute name ──────────── + +/** + * Convert a column name to a valid XML Name token. + * + * XML Name start character: letter or `_` (colon excluded for simplicity). + * XML Name character: letter, digit, `.`, `-`, `_`. + * Any invalid character is replaced with `_`. + */ +function toXmlName(name: string): string { + if (name.length === 0) { + return "_empty"; + } + const sanitized = name.replace(/[^A-Za-z0-9._-]/g, "_"); + // If the first character is a digit or hyphen/dot it's an invalid start char. + return /^[A-Za-z_]/.test(sanitized) ? sanitized : `_${sanitized}`; +} + +type Token = + | { kind: "open"; name: string; attrs: Record; selfClose: boolean } + | { kind: "close"; name: string } + | { kind: "text"; text: string } + | { kind: "pi" } + | { kind: "comment" } + | { kind: "doctype" }; + +function tokenize(xml: string): Token[] { + const tokens: Token[] = []; + let pos = 0; + const len = xml.length; + + while (pos < len) { + if (xml[pos] !== "<") { + // text node + const end = xml.indexOf("<", pos); + const raw = end === -1 ? xml.slice(pos) : xml.slice(pos, end); + tokens.push({ kind: "text", text: decodeEntities(raw) }); + pos = end === -1 ? 
len : end; + continue; + } + // starts with < + if (xml.startsWith("", pos + 4); + tokens.push({ kind: "comment" }); + pos = end === -1 ? len : end + 3; + continue; + } + if (xml.startsWith("", pos + 9); + const text = end === -1 ? xml.slice(pos + 9) : xml.slice(pos + 9, end); + tokens.push({ kind: "text", text }); + pos = end === -1 ? len : end + 3; + continue; + } + if (xml.startsWith("", pos + 2); + tokens.push({ kind: "pi" }); + pos = end === -1 ? len : end + 2; + continue; + } + if (xml.startsWith("", pos + 2); + tokens.push({ kind: "doctype" }); + pos = end === -1 ? len : end + 1; + continue; + } + if (xml[pos + 1] === "/") { + // closing tag + const end = xml.indexOf(">", pos + 2); + const raw = end === -1 ? xml.slice(pos + 2) : xml.slice(pos + 2, end); + tokens.push({ kind: "close", name: raw.trim() }); + pos = end === -1 ? len : end + 1; + continue; + } + // opening tag + const end = xml.indexOf(">", pos + 1); + if (end === -1) { + pos = len; + continue; + } + const inner = xml.slice(pos + 1, end); + const selfClose = inner.endsWith("/"); + const tagContent = selfClose ? inner.slice(0, -1) : inner; + // parse tag name and attributes + const match = /^([^\s/]+)([\s\S]*)$/.exec(tagContent.trim()); + if (!match) { + pos = end + 1; + continue; + } + const [, rawName = "", attrStr = ""] = match; + const attrs: Record = {}; + // parse attributes: name="value" or name='value' + const attrRe = /([^\s=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g; + let am: RegExpExecArray | null; + while ((am = attrRe.exec(attrStr)) !== null) { + const [, attrName = "", dq = "", sq = ""] = am; + attrs[localName(attrName)] = decodeEntities(dq || sq); + } + tokens.push({ kind: "open", name: rawName.trim(), attrs, selfClose }); + pos = end + 1; + } + return tokens; +} + +// ─── readXml ────────────────────────────────────────────────────────────────── + +/** + * Parse an XML string into a DataFrame. 
+ * + * @example + * ```ts + * const xml = ` + * Alice30 + * Bob25 + * `; + * const df = readXml(xml); + * df.columns.toArray(); // ["id", "name", "age"] + * df.shape; // [2, 3] + * ``` + */ +export function readXml(text: string, options: ReadXmlOptions = {}): DataFrame { + const { + rowTag, + indexCol = null, + usecols = null, + naValues: extraNa = [], + converters = true, + nrows, + attribs = true, + elems = true, + } = options; + + const naSet = new Set([...DEFAULT_NA, ...extraNa]); + + const tokens = tokenize(text); + const rows: Array> = []; + + // Discover rowTag from first repeating child of root if not specified + let resolvedRowTag = rowTag; + if (!resolvedRowTag) { + const childCounts: Map = new Map(); + let depth = 0; + for (const tok of tokens) { + if (tok.kind === "open") { + depth++; + if (depth === 2) { + const n = localName(tok.name); + childCounts.set(n, (childCounts.get(n) ?? 0) + 1); + } + if (tok.selfClose && depth === 2) depth--; + } else if (tok.kind === "close") { + depth--; + } + } + // pick the element with the highest count (most repeated child of root) + let best = ""; + let bestCount = 0; + for (const [name, count] of childCounts) { + if (count > bestCount) { + bestCount = count; + best = name; + } + } + resolvedRowTag = best || "row"; + } + + // Parse rows + let depth = 0; + let inRow = false; + let currentRow: Record = {}; + let currentElem = ""; + let currentText = ""; + let rowCount = 0; + + for (const tok of tokens) { + if (tok.kind === "open") { + depth++; + if (!inRow && depth >= 2 && localName(tok.name) === resolvedRowTag) { + inRow = true; + currentRow = {}; + if (attribs) { + for (const [k, v] of Object.entries(tok.attrs)) { + currentRow[k] = v; + } + } + if (tok.selfClose) { + inRow = false; + rows.push({ ...currentRow }); + rowCount++; + if (nrows !== undefined && rowCount >= nrows) break; + } + } else if (inRow && elems) { + currentElem = localName(tok.name); + currentText = ""; + // self-closing child elem → null + if 
(tok.selfClose) { + currentRow[currentElem] = null; + currentElem = ""; + } + } + if (tok.selfClose) depth--; + } else if (tok.kind === "text") { + if (inRow && currentElem) { + currentText += tok.text; + } + } else if (tok.kind === "close") { + const cln = localName(tok.name); + if (inRow && elems && currentElem && cln === currentElem) { + currentRow[currentElem] = currentText; + currentElem = ""; + currentText = ""; + } else if (inRow && cln === resolvedRowTag) { + inRow = false; + rows.push({ ...currentRow }); + rowCount++; + if (nrows !== undefined && rowCount >= nrows) break; + } + depth--; + } + } + + if (rows.length === 0) { + return DataFrame.fromColumns({}); + } + + // Collect all column names in order of first appearance + const colSet = new Set(); + for (const row of rows) { + for (const k of Object.keys(row)) colSet.add(k); + } + let cols = [...colSet]; + if (usecols) cols = cols.filter((c) => usecols.includes(c)); + + // Build column arrays + const colData: Record = {}; + for (const col of cols) { + colData[col] = rows.map((row) => { + const raw = row[col] ?? null; + if (raw === null || naSet.has(raw)) return null; + if (converters) { + const n = Number(raw); + if (!Number.isNaN(n) && raw.trim() !== "") return n; + } + return raw; + }); + } + + // Determine index + let idxCol: string | null = null; + if (typeof indexCol === "string") { + idxCol = indexCol; + } else if (typeof indexCol === "number" && indexCol < cols.length) { + idxCol = cols[indexCol] ?? null; + } + + if (idxCol !== null && cols.includes(idxCol)) { + const idxData = colData[idxCol] ?? []; + const dataColNames = cols.filter((c) => c !== idxCol); + const dataColData: Record = {}; + for (const c of dataColNames) { + dataColData[c] = colData[c] ?? 
[]; + } + const idx = new Index(idxData.filter(isLabel)); + return DataFrame.fromColumns(dataColData, { index: idx }); + } + + return DataFrame.fromColumns(colData); +} + +// ─── toXml ──────────────────────────────────────────────────────────────────── + +/** + * Serialize a DataFrame to an XML string. + * + * @example + * ```ts + * const df = DataFrame.fromColumns({ name: ["Alice", "Bob"], age: [30, 25] }); + * console.log(toXml(df)); + * // + * // + * // Alice30 + * // Bob25 + * // + * ``` + */ +export function toXml(df: DataFrame, options: ToXmlOptions = {}): string { + const { + rootName = "data", + rowName = "row", + attribs = false, + xmlDeclaration = true, + namespaces = {}, + indent = " ", + cdataCols = [], + } = options; + + const ind = indent ?? ""; + const nl = ind ? "\n" : ""; + + const lines: string[] = []; + + if (xmlDeclaration) { + lines.push(''); + } + + // Root element opening with optional namespace declarations + const nsAttrs = Object.entries(namespaces) + .map(([prefix, uri]) => ` xmlns:${prefix}="${encodeEntities(uri)}"`) + .join(""); + lines.push(`<${rootName}${nsAttrs}>`); + + const columns = df.columns.toArray(); + const nRows = df.shape[0]; + + for (let i = 0; i < nRows; i++) { + const rowValues: string[] = []; + for (const col of columns) { + const series = df.col(col); + const val = series.iloc(i); + rowValues.push(val === null || val === undefined ? "" : String(val)); + } + + if (attribs) { + // emit as attributes on the row element + const attrStr = columns + .map((c, j) => `${toXmlName(c)}="${encodeEntities(rowValues[j] ?? "")}"`) + .join(" "); + lines.push(`${ind}<${rowName} ${attrStr}/>`); + } else { + // emit as child elements + const childLines: string[] = []; + for (let j = 0; j < columns.length; j++) { + const col = columns[j] ?? ""; + const tag = toXmlName(col); + const raw = rowValues[j] ?? ""; + const isCdata = cdataCols.includes(col); + const content = isCdata ? 
`` : encodeEntities(raw); + childLines.push(`${ind}${ind}<${tag}>${content}`); + } + if (childLines.length === 0) { + lines.push(`${ind}<${rowName}/>`); + } else { + lines.push(`${ind}<${rowName}>${nl}${childLines.join(nl)}${nl}${ind}`); + } + } + } + + lines.push(``); + return lines.join(nl) + nl; +} diff --git a/src/reshape/index.ts b/src/reshape/index.ts index 6e03a5c3..3f132c43 100644 --- a/src/reshape/index.ts +++ b/src/reshape/index.ts @@ -14,3 +14,5 @@ export { wideToLong } from "./wide_to_long.ts"; export type { WideToLongOptions } from "./wide_to_long.ts"; export { pivotTableFull } from "./pivot_table.ts"; export type { PivotTableFullOptions } from "./pivot_table.ts"; +export { lreshape } from "./lreshape.ts"; +export type { LreshapeGroups, LreshapeOptions } from "./lreshape.ts"; diff --git a/src/reshape/lreshape.ts b/src/reshape/lreshape.ts new file mode 100644 index 00000000..ff89fdd1 --- /dev/null +++ b/src/reshape/lreshape.ts @@ -0,0 +1,197 @@ +/** + * lreshape — reshape wide-format data to long format using named column groups. + * + * Mirrors `pandas.lreshape(data, groups, dropna=True)`: + * - `data`: source DataFrame + * - `groups`: mapping from long-format column name → list of wide-format column names + * - `dropna`: when `true` (default), drop rows where any value column is `null`/`undefined`/`NaN` + * + * Each key in `groups` becomes a column in the output. The values (lists of column + * names) must all have the same length. The function stacks them vertically such + * that the first element of each list forms the first block of rows, the second + * element forms the second block, and so on. + * + * All columns in `data` that are **not** mentioned in any group value list become + * identity (id) columns — they are repeated for each block. 
+ * + * @example + * ```ts + * const df = DataFrame.fromColumns({ + * hr: [14, 7], + * team: ["Red", "Blue"], + * v1: [1, 3], + * v2: [2, 4], + * }); + * lreshape(df, { v: ["v1", "v2"] }); + * // hr team v + * // 14 Red 1 + * // 7 Blue 3 + * // 14 Red 2 + * // 7 Blue 4 + * ``` + * + * @module + */ + +import { DataFrame } from "../core/index.ts"; +import type { Index } from "../core/index.ts"; +import { RangeIndex } from "../core/index.ts"; +import type { Label, Scalar } from "../types.ts"; + +// ─── public types ────────────────────────────────────────────────────────────── + +/** + * Groups argument for {@link lreshape}. + * + * Maps each output column name to an ordered list of input column names. + * All lists must have the same length. + */ +export type LreshapeGroups = Record; + +/** Options for {@link lreshape}. */ +export interface LreshapeOptions { + /** + * When `true` (default), rows where **any** value column is `null`, + * `undefined`, or `NaN` are dropped from the result. + */ + readonly dropna?: boolean; +} + +// ─── helpers ────────────────────────────────────────────────────────────────── + +/** True when a scalar is considered missing: null, undefined, or NaN. */ +function isMissing(v: Scalar): boolean { + return v === null || v === undefined || (typeof v === "number" && Number.isNaN(v)); +} + +// ─── lreshape ───────────────────────────────────────────────────────────────── + +/** + * Reshape wide-format data to long format. + * + * Each entry in `groups` maps an output column name to a list of input column + * names that should be stacked into that output column. The input lists must + * all have the same length `k`; the function produces `nRows * k` output rows. + * + * Columns not mentioned in any group value list are treated as id columns and + * are repeated for every block. + * + * @param data - Source DataFrame (wide format). + * @param groups - Mapping from long-format column name → wide-format column list. 
+ * @param options - {@link LreshapeOptions} + * @returns A new long-format DataFrame. + * + * @example + * ```ts + * const df = DataFrame.fromColumns({ + * A: ["a", "b"], + * B1: [1, 2], + * B2: [3, 4], + * }); + * lreshape(df, { B: ["B1", "B2"] }); + * // A B + * // a 1 + * // b 2 + * // a 3 + * // b 4 + * ``` + */ +export function lreshape( + data: DataFrame, + groups: LreshapeGroups, + options?: LreshapeOptions, +): DataFrame { + const dropna = options?.dropna ?? true; + + const groupKeys = Object.keys(groups); + + if (groupKeys.length === 0) { + // No groups → return a copy with only id columns (same as no value cols) + return data; + } + + // Validate: all group lists must have the same length + const firstKey = groupKeys[0] as string; + const firstList = groups[firstKey] as readonly string[]; + const k = firstList.length; + + for (const key of groupKeys) { + const list = groups[key] as readonly string[]; + if (list.length !== k) { + throw new Error( + `lreshape: all group lists must have the same length, but "${firstKey}" has length ${k} and "${key}" has length ${list.length}`, + ); + } + } + + // Validate: all referenced columns must exist in `data` + const allGroupCols = new Set(); + for (const key of groupKeys) { + const list = groups[key] as readonly string[]; + for (const col of list) { + allGroupCols.add(col); + if (!data.columns.values.includes(col)) { + throw new Error(`lreshape: column "${col}" not found in DataFrame`); + } + } + } + + // Determine id columns: all data columns NOT mentioned in any group + const idCols = data.columns.values.filter((c) => !allGroupCols.has(c)); + + const nRows = data.index.size; + + // Output arrays: id columns + group output columns + const outData: Record = {}; + for (const id of idCols) { + outData[id] = []; + } + for (const key of groupKeys) { + outData[key] = []; + } + let totalRows = 0; + + // Iterate block by block (one block per position in each group list) + for (let blockIdx = 0; blockIdx < k; blockIdx++) { 
+ // For each row in the source + for (let ri = 0; ri < nRows; ri++) { + // Collect value-column values for this row in this block + const blockValues: Scalar[] = []; + for (const key of groupKeys) { + const list = groups[key] as readonly string[]; + const srcCol = list[blockIdx] as string; + const val: Scalar = data.col(srcCol).iat(ri); + blockValues.push(val); + } + + // Apply dropna filter + if (dropna && blockValues.some((v) => isMissing(v))) { + continue; + } + + totalRows++; + + // Id columns + for (const id of idCols) { + const col = outData[id]; + if (col !== undefined) { + col.push(data.col(id).iat(ri)); + } + } + + // Value columns + for (let vi = 0; vi < groupKeys.length; vi++) { + const key = groupKeys[vi] as string; + const col = outData[key]; + if (col !== undefined) { + const bv = blockValues[vi]; + col.push(bv !== undefined ? bv : null); + } + } + } + } + + const resultIndex: Index
k
A
`; + const html = "
k
A
"; const [df] = readHtml(html, { converters: false }); expect(df!.col("k").toArray()[0]).toBe("A"); }); test("decodes &#xHH; hex entities", () => { - const html = `
k
B
`; + const html = "
k
B
"; const [df] = readHtml(html, { converters: false }); expect(df!.col("k").toArray()[0]).toBe("B"); }); diff --git a/tests/io/read_table.test.ts b/tests/io/read_table.test.ts new file mode 100644 index 00000000..b2c8e2d2 --- /dev/null +++ b/tests/io/read_table.test.ts @@ -0,0 +1,313 @@ +/** + * Tests for src/io/read_table.ts — readTable(). + * + * Mirrors pandas.read_table() test suite: + * - default tab separator + * - custom separator + * - all ReadCsvOptions are forwarded + * - property-based round-trips + */ +import { describe, expect, it } from "bun:test"; +import fc from "fast-check"; +import { DataFrame, readCsv, readTable } from "../../src/index.ts"; + +// ─── basic parsing ──────────────────────────────────────────────────────────── + +describe("readTable — basic TSV parsing", () => { + it("parses a simple tab-separated file", () => { + const tsv = "name\tage\tcity\nAlice\t30\tNY\nBob\t25\tLA"; + const df = readTable(tsv); + expect(df.shape).toEqual([2, 3]); + expect([...df.columns.values]).toEqual(["name", "age", "city"]); + expect([...df.col("name").values]).toEqual(["Alice", "Bob"]); + expect([...df.col("age").values]).toEqual([30, 25]); + expect([...df.col("city").values]).toEqual(["NY", "LA"]); + }); + + it("infers integer dtype for numeric columns", () => { + const tsv = "x\ty\n1\t2\n3\t4"; + const df = readTable(tsv); + expect(df.col("x").dtype.name).toBe("int64"); + expect(df.col("y").dtype.name).toBe("int64"); + }); + + it("infers float dtype", () => { + const tsv = "a\tb\n1.5\t2.7\n3.1\t4.9"; + const df = readTable(tsv); + expect(df.col("a").dtype.name).toBe("float64"); + }); + + it("keeps string columns as object dtype", () => { + const tsv = "name\tval\nAlice\t10\nBob\t20"; + const df = readTable(tsv); + expect(df.col("name").dtype.name).toBe("object"); + }); + + it("handles a single column", () => { + const tsv = "x\n1\n2\n3"; + const df = readTable(tsv); + expect(df.shape).toEqual([3, 1]); + expect([...df.col("x").values]).toEqual([1, 2, 3]); 
+ }); + + it("handles empty file (header only)", () => { + const tsv = "a\tb\tc"; + const df = readTable(tsv); + expect(df.shape).toEqual([0, 3]); + }); + + it("handles NA values in columns", () => { + const tsv = "a\tb\n1\tNA\n2\t3"; + const df = readTable(tsv); + expect(Number.isNaN(df.col("b").values[0])).toBe(true); + expect(df.col("b").values[1]).toBe(3); + }); + + it("handles empty string fields as NaN for numeric columns", () => { + const tsv = "a\tb\n1\t\n2\t4"; + const df = readTable(tsv); + expect(Number.isNaN(df.col("b").values[0])).toBe(true); + }); +}); + +// ─── custom separator ───────────────────────────────────────────────────────── + +describe("readTable — custom separator", () => { + it("uses comma separator when explicitly passed", () => { + const csv = "a,b,c\n1,2,3"; + const df = readTable(csv, { sep: "," }); + expect(df.shape).toEqual([1, 3]); + expect([...df.col("a").values]).toEqual([1]); + }); + + it("uses pipe separator", () => { + const piped = "a|b|c\n1|2|3\n4|5|6"; + const df = readTable(piped, { sep: "|" }); + expect(df.shape).toEqual([2, 3]); + expect([...df.col("b").values]).toEqual([2, 5]); + }); + + it("uses semicolon separator", () => { + const text = "x;y\n10;20\n30;40"; + const df = readTable(text, { sep: ";" }); + expect([...df.col("x").values]).toEqual([10, 30]); + expect([...df.col("y").values]).toEqual([20, 40]); + }); + + it("uses multi-char separator", () => { + const text = "a::b::c\n1::2::3"; + const df = readTable(text, { sep: "::" }); + expect([...df.col("a").values]).toEqual([1]); + expect([...df.col("c").values]).toEqual([3]); + }); +}); + +// ─── ReadCsvOptions forwarding ──────────────────────────────────────────────── + +describe("readTable — ReadCsvOptions forwarding", () => { + it("respects indexCol option", () => { + const tsv = "id\tval\n1\t10\n2\t20"; + const df = readTable(tsv, { indexCol: "id" }); + expect([...df.index.values]).toEqual([1, 2]); + expect([...df.columns.values]).toEqual(["val"]); + }); + + 
it("respects nRows option", () => { + const tsv = "a\tb\n1\t2\n3\t4\n5\t6"; + const df = readTable(tsv, { nRows: 2 }); + expect(df.shape).toEqual([2, 2]); + expect([...df.col("a").values]).toEqual([1, 3]); + }); + + it("respects skipRows option", () => { + const tsv = "a\tb\n1\t2\n3\t4\n5\t6"; + const df = readTable(tsv, { skipRows: 1 }); + expect(df.shape).toEqual([2, 2]); + expect([...df.col("a").values]).toEqual([3, 5]); + }); + + it("respects header: null (no header row)", () => { + const tsv = "1\t2\t3\n4\t5\t6"; + const df = readTable(tsv, { header: null }); + expect(df.shape).toEqual([2, 3]); + // Columns are auto-assigned (0, 1, 2) + expect(df.columns.size).toBe(3); + }); + + it("respects dtype option", () => { + const tsv = "x\ty\n1\t2\n3\t4"; + const df = readTable(tsv, { dtype: { x: "float64" } }); + expect(df.col("x").dtype.name).toBe("float64"); + }); + + it("respects naValues option", () => { + const tsv = "a\tb\n1\tMISSING\n2\t3"; + const df = readTable(tsv, { naValues: ["MISSING"] }); + expect(Number.isNaN(df.col("b").values[0])).toBe(true); + expect(df.col("b").values[1]).toBe(3); + }); +}); + +// ─── default vs explicit separator ─────────────────────────────────────────── + +describe("readTable vs readCsv — default separator difference", () => { + it("readTable defaults to tab; readCsv defaults to comma", () => { + const tsv = "a\tb\n1\t2"; + const csv = "a,b\n1,2"; + + const dfTable = readTable(tsv); + const dfCsv = readCsv(csv); + + expect([...dfTable.columns.values]).toEqual(["a", "b"]); + expect([...dfCsv.columns.values]).toEqual(["a", "b"]); + expect([...dfTable.col("a").values]).toEqual([1]); + expect([...dfCsv.col("a").values]).toEqual([1]); + }); + + it("readTable with comma-sep text treats entire line as single column", () => { + // Default sep=\t — commas are NOT separators + const csv = "a,b\n1,2\n3,4"; + const df = readTable(csv); + // The whole "a,b" is one column name + expect(df.columns.size).toBe(1); + }); +}); + +// ─── 
whitespace and edge cases ──────────────────────────────────────────────── + +describe("readTable — edge cases", () => { + it("handles trailing newline", () => { + const tsv = "a\tb\n1\t2\n"; + const df = readTable(tsv); + expect(df.shape).toEqual([1, 2]); + }); + + it("handles Windows-style CRLF", () => { + const tsv = "a\tb\r\n1\t2\r\n3\t4\r\n"; + const df = readTable(tsv); + expect(df.shape).toEqual([2, 2]); + expect([...df.col("a").values]).toEqual([1, 3]); + }); + + it("handles a large file", () => { + const rows = Array.from({ length: 1000 }, (_, i) => `${i}\t${i * 2}`); + const tsv = `idx\tval\n${rows.join("\n")}`; + const df = readTable(tsv); + expect(df.shape).toEqual([1000, 2]); + expect(df.col("idx").values[999]).toBe(999); + expect(df.col("val").values[999]).toBe(1998); + }); +}); + +// ─── property-based tests ───────────────────────────────────────────────────── + +describe("readTable — property-based", () => { + it("round-trips integer data through tab-separated format", () => { + fc.assert( + fc.property( + fc.array( + fc.record({ + a: fc.integer({ min: -1000, max: 1000 }), + b: fc.integer({ min: 0, max: 9999 }), + }), + { minLength: 1, maxLength: 50 }, + ), + (rows) => { + const lines = ["a\tb", ...rows.map((r) => `${r.a}\t${r.b}`)]; + const tsv = lines.join("\n"); + const df = readTable(tsv); + expect(df.shape).toEqual([rows.length, 2]); + for (let i = 0; i < rows.length; i++) { + expect(df.col("a").values[i]).toBe(rows[i]!.a); + expect(df.col("b").values[i]).toBe(rows[i]!.b); + } + }, + ), + ); + }); + + it("produces same result as readCsv with matching sep", () => { + fc.assert( + fc.property( + fc.array( + fc.record({ + x: fc.float({ min: -100, max: 100, noNaN: true }), + }), + { minLength: 1, maxLength: 30 }, + ), + (rows) => { + const lines = ["x", ...rows.map((r) => String(r.x))]; + const tsv = lines.join("\n"); + const dfTable = readTable(tsv, { sep: "\t" }); + const dfCsv = readCsv(tsv.replaceAll("\t", "\t"), { sep: "\t" }); + 
expect(dfTable.shape).toEqual(dfCsv.shape); + }, + ), + ); + }); + + it("readTable with explicit sep matches readCsv with same sep", () => { + fc.assert( + fc.property( + fc.array(fc.integer({ min: 0, max: 9999 }), { minLength: 1, maxLength: 20 }), + (vals) => { + const lines = ["v", ...vals.map(String)]; + const text = lines.join("\n"); + const dfTable = readTable(text); + // Default sep=\t, and our data has no tabs, so single col + // Just check shape is valid + expect(dfTable.shape[0]).toBe(vals.length); + }, + ), + ); + }); + + it("comma-sep round-trip: readTable({sep:','}) equals readCsv", () => { + fc.assert( + fc.property( + fc.array( + fc.record({ + col1: fc.integer({ min: 0, max: 100 }), + col2: fc.integer({ min: 0, max: 100 }), + }), + { minLength: 1, maxLength: 40 }, + ), + (rows) => { + const csv = `col1,col2\n${rows.map((r) => `${r.col1},${r.col2}`).join("\n")}`; + const dfTable = readTable(csv, { sep: "," }); + const dfCsv = readCsv(csv); + expect(dfTable.shape).toEqual(dfCsv.shape); + for (let i = 0; i < rows.length; i++) { + expect(dfTable.col("col1").values[i]).toBe(dfCsv.col("col1").values[i]); + expect(dfTable.col("col2").values[i]).toBe(dfCsv.col("col2").values[i]); + } + }, + ), + ); + }); +}); + +// ─── DataFrame integration ──────────────────────────────────────────────────── + +describe("readTable — DataFrame integration", () => { + it("returns a proper DataFrame instance", () => { + const df = readTable("a\tb\n1\t2"); + expect(df).toBeInstanceOf(DataFrame); + }); + + it("can chain DataFrame methods after readTable", () => { + const tsv = "a\tb\tc\n1\t2\t3\n4\t5\t6\n7\t8\t9"; + const df = readTable(tsv); + const filtered = df.select(["a", "c"]); + expect(filtered.shape).toEqual([3, 2]); + expect([...filtered.columns.values]).toEqual(["a", "c"]); + }); + + it("supports multi-row operations on parsed data", () => { + const tsv = "x\ty\n10\t20\n30\t40\n50\t60"; + const df = readTable(tsv); + // Sum via reduce + const sumX = 
[...df.col("x").values].reduce((a, b) => (a as number) + (b as number), 0); + expect(sumX).toBe(90); + }); +}); diff --git a/tests/io/sql.test.ts b/tests/io/sql.test.ts new file mode 100644 index 00000000..936438ce --- /dev/null +++ b/tests/io/sql.test.ts @@ -0,0 +1,561 @@ +/** + * Tests for src/io/sql.ts — readSql, readSqlQuery, readSqlTable, toSql. + * + * Uses an in-memory MockAdapter that stores tables as arrays of row objects so + * all functionality can be exercised without an external database. + */ +import { describe, expect, it } from "bun:test"; +import fc from "fast-check"; +import { DataFrame, readSql, readSqlQuery, readSqlTable, toSql } from "../../src/index.ts"; +import type { + IfExistsStrategy, + SqlConnection, + SqlResult, + SqlRow, + SqlValue, +} from "../../src/index.ts"; +import { TableExistsError, TableNotFoundError } from "../../src/index.ts"; + +// ─── MockAdapter ────────────────────────────────────────────────────────────── + +/** + * Minimal in-memory SQL adapter for testing. + * + * Supports: + * - `SELECT * FROM ""` (exact pattern generated by readSqlTable) + * - `SELECT col1, col2 FROM "
"` (column projection) + * - `INSERT INTO "
" (...) VALUES (...)` (single-row inserts) + * - `DROP TABLE IF EXISTS "
"` + * - `listTables()` and `insert()` adapter methods + */ +class MockAdapter implements SqlConnection { + private readonly tables: Map = new Map(); + private readonly schemas: Map = new Map(); + + /** Seed a table with pre-existing data. */ + seed(name: string, rows: SqlRow[]): void { + this.tables.set( + name, + rows.map((r) => ({ ...r })), + ); + if (rows.length > 0) { + const first = rows[0]; + if (first !== undefined) { + this.schemas.set(name, Object.keys(first)); + } + } + } + + query(sql: string): SqlResult { + const trimmed = sql.trim(); + + // DROP TABLE IF EXISTS "" + const dropMatch = /^DROP TABLE IF EXISTS "(.+)"$/i.exec(trimmed); + if (dropMatch !== null) { + const name = dropMatch[1]; + if (name !== undefined) { + this.tables.delete(name); + this.schemas.delete(name); + } + return { columns: [], rows: [] }; + } + + // INSERT INTO "" (col, …) VALUES (val, …) + const insertMatch = /^INSERT INTO "(.+)" \((.+)\) VALUES \((.+)\)$/i.exec(trimmed); + if (insertMatch !== null) { + const [, rawName, rawCols, rawVals] = insertMatch; + if (rawName !== undefined && rawCols !== undefined && rawVals !== undefined) { + const cols = rawCols.split(",").map((c) => c.trim().replace(/^"|"$/g, "")); + const vals = parseValueList(rawVals); + const row: SqlRow = {}; + for (let i = 0; i < cols.length; i++) { + const col = cols[i]; + const val = vals[i]; + if (col !== undefined && val !== undefined) { + row[col] = val; + } + } + const existing = this.tables.get(rawName); + if (existing !== undefined) { + existing.push(row); + } else { + this.tables.set(rawName, [row]); + } + if (!this.schemas.has(rawName)) { + this.schemas.set(rawName, cols); + } + } + return { columns: [], rows: [] }; + } + + // SELECT … FROM "" + const selectMatch = /^SELECT\s+(.+?)\s+FROM\s+"([^"]+)"(?:\s*$)/i.exec(trimmed); + if (selectMatch !== null) { + const [, selectCols, rawName] = selectMatch; + if (rawName !== undefined && selectCols !== undefined) { + const rows = this.tables.get(rawName) ?? 
[]; + const allCols = this.schemas.get(rawName) ?? (rows.length > 0 ? Object.keys(rows[0]!) : []); + const wantedCols = + selectCols.trim() === "*" + ? allCols + : selectCols.split(",").map((c) => c.trim().replace(/^"|"$/g, "")); + const resultRows: SqlRow[] = rows.map((r) => { + const out: SqlRow = {}; + for (const col of wantedCols) { + out[col] = r[col] ?? null; + } + return out; + }); + return { columns: wantedCols, rows: resultRows }; + } + } + + return { columns: [], rows: [] }; + } + + listTables(): readonly string[] { + return [...this.tables.keys()]; + } + + insert( + tableName: string, + rows: readonly SqlRow[], + columns: readonly string[], + ifExists: IfExistsStrategy, + ): number { + const existing = this.tables.get(tableName); + if (existing !== undefined) { + if (ifExists === "fail") { + throw new TableExistsError(tableName); + } + if (ifExists === "replace") { + this.tables.delete(tableName); + this.schemas.delete(tableName); + } + } + const arr = this.tables.get(tableName) ?? []; + for (const row of rows) { + arr.push({ ...row }); + } + this.tables.set(tableName, arr); + this.schemas.set(tableName, [...columns]); + return rows.length; + } + + /** Expose stored rows for assertions. */ + getRows(name: string): SqlRow[] { + return this.tables.get(name) ?? []; + } +} + +// ─── SQL literal parser for mock INSERT handling ────────────────────────────── + +function parseValueList(raw: string): SqlValue[] { + const values: SqlValue[] = []; + let i = 0; + + while (i < raw.length) { + while (i < raw.length && raw[i] === " ") i++; + if (i >= raw.length) break; + + const ch = raw[i]; + if (ch === undefined) break; + + if (ch === "N" && raw.slice(i, i + 4) === "NULL") { + values.push(null); + i += 4; + } else if (ch === "'") { + // String literal + i++; // skip opening quote + let s = ""; + while (i < raw.length) { + const c = raw[i]; + if (c === "'") { + if (raw[i + 1] === "'") { + s += "'"; + i += 2; + } else { + i++; + break; + } + } else { + s += c ?? 
""; + i++; + } + } + values.push(s); + } else if (ch === "X" && raw[i + 1] === "'") { + // Hex blob: X'deadbeef' + i += 2; + let hex = ""; + while (i < raw.length && raw[i] !== "'") { + hex += raw[i]; + i++; + } + i++; // skip closing quote + const bytes = new Uint8Array(hex.length / 2); + for (let b = 0; b < bytes.length; b++) { + bytes[b] = Number.parseInt(hex.slice(b * 2, b * 2 + 2), 16); + } + values.push(bytes); + } else { + // Number + let numStr = ""; + while (i < raw.length && raw[i] !== "," && raw[i] !== " ") { + numStr += raw[i]; + i++; + } + const n = Number(numStr); + values.push(Number.isNaN(n) ? numStr : n); + } + + while (i < raw.length && raw[i] === " ") i++; + if (raw[i] === ",") i++; + } + + return values; +} + +// ─── readSqlQuery ───────────────────────────────────────────────────────────── + +describe("readSqlQuery — basic", () => { + it("returns a DataFrame with correct shape and values", () => { + const db = new MockAdapter(); + db.seed("users", [ + { id: 1, name: "Alice", score: 9.5 }, + { id: 2, name: "Bob", score: 7.0 }, + ]); + const df = readSqlQuery('SELECT * FROM "users"', db); + expect(df.shape).toEqual([2, 3]); + expect([...df.columns.values]).toEqual(["id", "name", "score"]); + expect([...df.col("id").values]).toEqual([1, 2]); + expect([...df.col("name").values]).toEqual(["Alice", "Bob"]); + }); + + it("respects indexCol (string)", () => { + const db = new MockAdapter(); + db.seed("t", [ + { id: 10, val: "a" }, + { id: 20, val: "b" }, + ]); + const df = readSqlQuery('SELECT * FROM "t"', db, { indexCol: "id" }); + expect(df.shape).toEqual([2, 1]); + expect([...df.columns.values]).toEqual(["val"]); + expect([...df.index.values]).toEqual([10, 20]); + expect(df.index.name).toBe("id"); + }); + + it("respects indexCol (number)", () => { + const db = new MockAdapter(); + db.seed("t", [{ id: 5, x: 1 }]); + const df = readSqlQuery('SELECT * FROM "t"', db, { indexCol: 0 }); + expect([...df.index.values]).toEqual([5]); + }); + + it("parses 
date columns", () => { + const db = new MockAdapter(); + db.seed("events", [{ dt: "2024-01-01", val: 1 }]); + const df = readSqlQuery('SELECT * FROM "events"', db, { + parseDates: ["dt"], + }); + const dtVal = df.col("dt").values[0]; + expect(typeof dtVal).toBe("number"); + const d = new Date(dtVal as number); + expect(d.getUTCFullYear()).toBe(2024); + }); + + it("null values stay null", () => { + const db = new MockAdapter(); + db.seed("t", [{ x: null }]); + const df = readSqlQuery('SELECT * FROM "t"', db); + expect(df.col("x").values[0]).toBeNull(); + }); + + it("returns empty DataFrame for empty result", () => { + const db = new MockAdapter(); + const result: SqlResult = { columns: ["a", "b"], rows: [] }; + const df = readSqlQuery("SELECT a, b FROM empty_table", { + query() { + return result; + }, + }); + expect(df.shape).toEqual([0, 2]); + expect([...df.columns.values]).toEqual(["a", "b"]); + }); +}); + +// ─── readSqlTable ───────────────────────────────────────────────────────────── + +describe("readSqlTable — basic", () => { + it("reads entire table", () => { + const db = new MockAdapter(); + db.seed("products", [ + { id: 1, name: "Widget", price: 9.99 }, + { id: 2, name: "Gadget", price: 24.99 }, + ]); + const df = readSqlTable("products", db); + expect(df.shape).toEqual([2, 3]); + expect([...df.col("price").values]).toEqual([9.99, 24.99]); + }); + + it("projects requested columns", () => { + const db = new MockAdapter(); + db.seed("products", [{ id: 1, name: "W", price: 1 }]); + const df = readSqlTable("products", db, { columns: ["id", "name"] }); + expect([...df.columns.values]).toEqual(["id", "name"]); + expect(df.shape).toEqual([1, 2]); + }); + + it("throws TableNotFoundError for unknown table", () => { + const db = new MockAdapter(); + expect(() => readSqlTable("missing", db)).toThrow(TableNotFoundError); + }); + + it("does not validate when listTables is absent", () => { + const minimalConn: SqlConnection = { + query(): SqlResult { + return { columns: 
["x"], rows: [{ x: 1 }] }; + }, + }; + const df = readSqlTable("any_table", minimalConn); + expect(df.shape).toEqual([1, 1]); + }); +}); + +// ─── readSql ────────────────────────────────────────────────────────────────── + +describe("readSql — auto-detect", () => { + it("detects SQL query by whitespace", () => { + const db = new MockAdapter(); + db.seed("orders", [{ id: 1, amount: 100 }]); + const df = readSql('SELECT id, amount FROM "orders"', db); + expect(df.shape).toEqual([1, 2]); + }); + + it("detects table name (no whitespace)", () => { + const db = new MockAdapter(); + db.seed("orders", [{ id: 1 }, { id: 2 }]); + const df = readSql("orders", db); + expect(df.shape).toEqual([2, 1]); + }); +}); + +// ─── toSql ──────────────────────────────────────────────────────────────────── + +describe("toSql — basic", () => { + it("writes all rows and returns count", () => { + const db = new MockAdapter(); + const df = DataFrame.fromColumns({ + name: ["Alice", "Bob"], + score: [100, 90], + }); + const written = toSql(df, "results", db); + expect(written).toBe(2); + const stored = db.getRows("results"); + expect(stored).toHaveLength(2); + }); + + it("writes index column when index: true (default)", () => { + const db = new MockAdapter(); + const df = DataFrame.fromColumns({ x: [10, 20] }); + toSql(df, "t", db, { index: true }); + const rows = db.getRows("t"); + expect(rows[0]).toHaveProperty("index"); + expect(rows[0]!["index"]).toBe(0); + }); + + it("omits index column when index: false", () => { + const db = new MockAdapter(); + const df = DataFrame.fromColumns({ x: [1, 2] }); + toSql(df, "t", db, { index: false }); + const rows = db.getRows("t"); + expect(rows[0]).not.toHaveProperty("index"); + expect(rows[0]).toHaveProperty("x"); + }); + + it("respects custom indexLabel", () => { + const db = new MockAdapter(); + const df = DataFrame.fromColumns({ v: [99] }); + toSql(df, "t", db, { indexLabel: "row_id" }); + expect(db.getRows("t")[0]).toHaveProperty("row_id"); + }); + 
+ it("ifExists: fail throws when table exists", () => { + const db = new MockAdapter(); + db.seed("t", [{ x: 1 }]); + const df = DataFrame.fromColumns({ x: [2] }); + expect(() => toSql(df, "t", db, { ifExists: "fail" })).toThrow(TableExistsError); + }); + + it("ifExists: replace overwrites data", () => { + const db = new MockAdapter(); + db.seed("t", [{ x: 1 }, { x: 2 }]); + const df = DataFrame.fromColumns({ x: [99] }); + toSql(df, "t", db, { ifExists: "replace", index: false }); + const rows = db.getRows("t"); + expect(rows).toHaveLength(1); + expect(rows[0]!["x"]).toBe(99); + }); + + it("ifExists: append adds to existing data", () => { + const db = new MockAdapter(); + db.seed("t", [{ x: 1 }]); + const df = DataFrame.fromColumns({ x: [2, 3] }); + toSql(df, "t", db, { ifExists: "append", index: false }); + const rows = db.getRows("t"); + expect(rows).toHaveLength(3); + }); + + it("returns 0 rows for empty DataFrame", () => { + const db = new MockAdapter(); + const df = DataFrame.fromColumns({ x: [] as number[] }); + const n = toSql(df, "empty", db, { index: false }); + expect(n).toBe(0); + }); +}); + +// ─── toSql fallback (query-only adapter) ───────────────────────────────────── + +describe("toSql — fallback path (no insert method)", () => { + it("writes rows via INSERT statements", () => { + const inserted: string[] = []; + const queryConn: SqlConnection = { + query(sql: string): SqlResult { + inserted.push(sql); + return { columns: [], rows: [] }; + }, + }; + const df = DataFrame.fromColumns({ a: [1, 2], b: ["x", "y"] }); + const n = toSql(df, "dest", queryConn, { index: false }); + expect(n).toBe(2); + expect(inserted.some((s) => /INSERT INTO/.test(s))).toBe(true); + }); + + it("chunksize controls batch grouping", () => { + const calls: string[] = []; + const queryConn: SqlConnection = { + query(sql: string): SqlResult { + calls.push(sql); + return { columns: [], rows: [] }; + }, + }; + const df = DataFrame.fromColumns({ v: [1, 2, 3, 4, 5] }); + toSql(df, 
"t", queryConn, { index: false, chunksize: 2 }); + const inserts = calls.filter((s) => /INSERT INTO/.test(s)); + expect(inserts).toHaveLength(5); + }); + + it("handles null scalar values", () => { + const sqls: string[] = []; + const queryConn: SqlConnection = { + query(sql: string): SqlResult { + sqls.push(sql); + return { columns: [], rows: [] }; + }, + }; + const df = DataFrame.fromColumns({ x: [null] }); + toSql(df, "t", queryConn, { index: false }); + expect(sqls.some((s) => s.includes("NULL"))).toBe(true); + }); +}); + +// ─── round-trip ─────────────────────────────────────────────────────────────── + +describe("toSql / readSqlTable — round-trip", () => { + it("numeric data survives a round-trip", () => { + const db = new MockAdapter(); + const original = DataFrame.fromColumns({ + a: [1, 2, 3], + b: [0.1, 0.2, 0.3], + }); + toSql(original, "data", db, { index: false }); + const restored = readSqlTable("data", db); + expect(restored.shape).toEqual([3, 2]); + expect([...restored.col("a").values]).toEqual([1, 2, 3]); + expect([...restored.col("b").values]).toEqual([0.1, 0.2, 0.3]); + }); + + it("string data survives a round-trip", () => { + const db = new MockAdapter(); + const original = DataFrame.fromColumns({ name: ["Alice", "Bob"] }); + toSql(original, "names", db, { index: false }); + const restored = readSqlTable("names", db); + expect([...restored.col("name").values]).toEqual(["Alice", "Bob"]); + }); + + it("boolean data survives a round-trip via fallback path", () => { + const rows: SqlRow[] = []; + let dropCalled = false; + const fakeConn: SqlConnection = { + query(sql: string): SqlResult { + if (/^DROP/i.test(sql)) { + dropCalled = true; + rows.length = 0; + return { columns: [], rows: [] }; + } + if (/^INSERT/i.test(sql)) { + // Parse the boolean-like values out for assertion + rows.push({ _sql: sql }); + return { columns: [], rows: [] }; + } + return { columns: ["flag"], rows }; + }, + }; + const df = DataFrame.fromColumns({ flag: [true, false] }); 
+ toSql(df, "t", fakeConn, { index: false, ifExists: "replace" }); + expect(dropCalled).toBe(true); + expect(rows).toHaveLength(2); + }); +}); + +// ─── property-based tests ───────────────────────────────────────────────────── + +describe("readSqlQuery — property tests", () => { + it("shape matches result column/row counts", () => { + fc.assert( + fc.property( + fc.array(fc.string({ minLength: 1, maxLength: 10 }), { + minLength: 1, + maxLength: 5, + }), + fc.integer({ min: 0, max: 20 }), + (cols, rowCount) => { + const uniqueCols = [...new Set(cols)]; + if (uniqueCols.length === 0) return; + const rows: SqlRow[] = Array.from({ length: rowCount }, () => { + const row: SqlRow = {}; + for (const c of uniqueCols) { + row[c] = 42; + } + return row; + }); + const result: SqlResult = { columns: uniqueCols, rows }; + const conn: SqlConnection = { query: () => result }; + const df = readSqlQuery("SELECT 1", conn); + expect(df.shape).toEqual([rowCount, uniqueCols.length]); + }, + ), + ); + }); +}); + +describe("toSql — property tests", () => { + it("round-trip preserves number of rows (adapter path)", () => { + fc.assert( + fc.property( + fc.array(fc.double({ noNaN: true, noDefaultInfinity: true }), { + minLength: 0, + maxLength: 30, + }), + (vals) => { + const db = new MockAdapter(); + const df = DataFrame.fromColumns({ v: vals }); + const written = toSql(df, "tbl", db, { index: false }); + expect(written).toBe(vals.length); + const back = readSqlTable("tbl", db); + expect(back.shape[0]).toBe(vals.length); + }, + ), + ); + }); +}); diff --git a/tests/io/stata.test.ts b/tests/io/stata.test.ts new file mode 100644 index 00000000..11ae394c --- /dev/null +++ b/tests/io/stata.test.ts @@ -0,0 +1,364 @@ +/** + * Tests for src/io/stata.ts — readStata() and toStata(). 
+ */ +import { describe, expect, it } from "bun:test"; +import fc from "fast-check"; +import { DataFrame, readStata, toStata } from "../../src/index.ts"; + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +/** Write then read back the DataFrame, returning the round-trip copy. */ +function roundTrip(df: DataFrame): DataFrame { + const buf = toStata(df); + return readStata(buf); +} + +// ─── toStata: output shape ──────────────────────────────────────────────────── + +describe("toStata — output format", () => { + it("returns a non-empty Uint8Array", () => { + const df = DataFrame.fromColumns({ x: [1, 2, 3] }); + const buf = toStata(df); + expect(buf).toBeInstanceOf(Uint8Array); + expect(buf.length).toBeGreaterThan(0); + }); + + it("starts with ", () => { + const df = DataFrame.fromColumns({ x: [1] }); + const buf = toStata(df); + const header = new TextDecoder().decode(buf.subarray(0, 11)); + expect(header).toBe(""); + }); + + it("contains 118", () => { + const df = DataFrame.fromColumns({ a: [1, 2] }); + const text = new TextDecoder("latin1").decode(toStata(df).subarray(0, 200)); + expect(text).toContain("118"); + }); + + it("contains little-endian byteorder marker", () => { + const df = DataFrame.fromColumns({ a: [1] }); + const text = new TextDecoder("latin1").decode(toStata(df).subarray(0, 300)); + expect(text).toContain("LSF"); + }); +}); + +// ─── Round-trip: numeric columns ───────────────────────────────────────────── + +describe("readStata ∘ toStata — numeric round-trip", () => { + it("round-trips integer-like values as doubles", () => { + const df = DataFrame.fromColumns({ a: [1, 2, 3], b: [10, 20, 30] }); + const rt = roundTrip(df); + expect(rt.shape).toEqual([3, 2]); + expect([...rt.columns.values]).toEqual(["a", "b"]); + expect([...rt.col("a").values]).toEqual([1, 2, 3]); + expect([...rt.col("b").values]).toEqual([10, 20, 30]); + }); + + it("round-trips floating-point values", () => { + const df = 
DataFrame.fromColumns({ x: [1.5, 2.75, -0.125] }); + const rt = roundTrip(df); + const vals = [...rt.col("x").values] as number[]; + expect(vals[0]).toBeCloseTo(1.5); + expect(vals[1]).toBeCloseTo(2.75); + expect(vals[2]).toBeCloseTo(-0.125); + }); + + it("round-trips negative integers", () => { + const df = DataFrame.fromColumns({ v: [-100, 0, 100] }); + const rt = roundTrip(df); + expect([...rt.col("v").values]).toEqual([-100, 0, 100]); + }); +}); + +// ─── Round-trip: null / missing values ─────────────────────────────────────── + +describe("readStata ∘ toStata — null / missing values", () => { + it("round-trips null in a numeric column", () => { + const df = DataFrame.fromColumns({ a: [1, null, 3] }); + const rt = roundTrip(df); + expect([...rt.col("a").values]).toEqual([1, null, 3]); + }); + + it("round-trips all-null column", () => { + const df = DataFrame.fromColumns({ a: [null, null] }); + const rt = roundTrip(df); + expect([...rt.col("a").values]).toEqual([null, null]); + }); + + it("round-trips null in a string column", () => { + const df = DataFrame.fromColumns({ s: ["hello", null, "world"] }); + const rt = roundTrip(df); + // null strings come back as empty strings after trimming null bytes + const vals = [...rt.col("s").values] as string[]; + expect(vals[0]).toBe("hello"); + expect(vals[2]).toBe("world"); + }); +}); + +// ─── Round-trip: string columns ────────────────────────────────────────────── + +describe("readStata ∘ toStata — string columns", () => { + it("round-trips short ASCII strings", () => { + const df = DataFrame.fromColumns({ name: ["Alice", "Bob", "Carol"] }); + const rt = roundTrip(df); + expect([...rt.col("name").values]).toEqual(["Alice", "Bob", "Carol"]); + }); + + it("round-trips empty strings", () => { + const df = DataFrame.fromColumns({ s: ["", "a", ""] }); + const rt = roundTrip(df); + const vals = [...rt.col("s").values]; + expect(vals[1]).toBe("a"); + }); + + it("round-trips a string that is exactly 2045 bytes", () => { + 
const long = "x".repeat(2045); + const df = DataFrame.fromColumns({ s: [long] }); + const rt = roundTrip(df); + expect(([...rt.col("s").values][0] as string).length).toBe(2045); + }); + + it("truncates strings longer than 2045 bytes", () => { + const long = "y".repeat(3000); + const df = DataFrame.fromColumns({ s: [long] }); + const rt = roundTrip(df); + expect(([...rt.col("s").values][0] as string).length).toBe(2045); + }); +}); + +// ─── Round-trip: boolean columns ───────────────────────────────────────────── + +describe("readStata ∘ toStata — boolean columns", () => { + it("round-trips booleans as 0/1 bytes", () => { + const df = DataFrame.fromColumns({ flag: [true, false, true] }); + const rt = roundTrip(df); + const vals = [...rt.col("flag").values] as number[]; + expect(vals[0]).toBe(1); + expect(vals[1]).toBe(0); + expect(vals[2]).toBe(1); + }); +}); + +// ─── Round-trip: multi-column ───────────────────────────────────────────────── + +describe("readStata ∘ toStata — multi-column", () => { + it("preserves column order", () => { + const df = DataFrame.fromColumns({ z: [3], a: [1], m: [2] }); + const rt = roundTrip(df); + expect([...rt.columns.values]).toEqual(["z", "a", "m"]); + }); + + it("preserves values across mixed-type columns", () => { + const df = DataFrame.fromColumns({ + id: [1, 2, 3], + name: ["x", "y", "z"], + score: [9.5, null, 7.0], + }); + const rt = roundTrip(df); + expect(rt.shape).toEqual([3, 3]); + expect([...rt.col("id").values]).toEqual([1, 2, 3]); + expect([...rt.col("name").values]).toEqual(["x", "y", "z"]); + const scores = [...rt.col("score").values] as (number | null)[]; + expect(scores[0]).toBeCloseTo(9.5); + expect(scores[1]).toBeNull(); + expect(scores[2]).toBeCloseTo(7.0); + }); +}); + +// ─── readStata options ─────────────────────────────────────────────────────── + +describe("readStata — options", () => { + it("nRows limits the number of rows returned", () => { + const df = DataFrame.fromColumns({ v: [1, 2, 3, 4, 5] }); + 
const buf = toStata(df); + const rt = readStata(buf, { nRows: 2 }); + expect(rt.shape[0]).toBe(2); + expect([...rt.col("v").values]).toEqual([1, 2]); + }); + + it("nRows = 0 returns empty DataFrame", () => { + const df = DataFrame.fromColumns({ v: [1, 2, 3] }); + const rt = readStata(toStata(df), { nRows: 0 }); + expect(rt.shape[0]).toBe(0); + }); + + it("usecols filters to named columns only", () => { + const df = DataFrame.fromColumns({ a: [1, 2], b: [3, 4], c: [5, 6] }); + const rt = readStata(toStata(df), { usecols: ["a", "c"] }); + expect([...rt.columns.values]).toEqual(["a", "c"]); + expect([...rt.col("a").values]).toEqual([1, 2]); + expect([...rt.col("c").values]).toEqual([5, 6]); + }); + + it("usecols: empty array returns no columns", () => { + const df = DataFrame.fromColumns({ a: [1], b: [2] }); + const rt = readStata(toStata(df), { usecols: [] }); + expect(rt.shape[1]).toBe(0); + }); + + it("indexCol by name sets the row index", () => { + const df = DataFrame.fromColumns({ id: [10, 20, 30], val: [1, 2, 3] }); + const rt = readStata(toStata(df), { indexCol: "id" }); + expect([...rt.index.toArray()]).toEqual([10, 20, 30]); + expect([...rt.columns.values]).toEqual(["val"]); + }); +}); + +// ─── toStata options ────────────────────────────────────────────────────────── + +describe("toStata — options", () => { + it("writeIndex=true adds _index column", () => { + const df = DataFrame.fromColumns({ v: [10, 20] }); + const rt = readStata(toStata(df, { writeIndex: true })); + expect([...rt.columns.values]).toContain("_index"); + }); + + it("dataLabel is embedded in the file (new format has length prefix)", () => { + const df = DataFrame.fromColumns({ x: [1] }); + const buf = toStata(df, { dataLabel: "My Dataset" }); + const text = new TextDecoder("latin1").decode(buf); + expect(text).toContain("My Dataset"); + }); + + it("variableLabels are embedded for each named column", () => { + const df = DataFrame.fromColumns({ age: [25] }); + const buf = toStata(df, { 
variableLabels: { age: "Age in years" } }); + const text = new TextDecoder("latin1").decode(buf); + expect(text).toContain("Age in years"); + }); +}); + +// ─── readStata: error handling ──────────────────────────────────────────────── + +describe("readStata — error handling", () => { + it("throws on empty buffer", () => { + expect(() => readStata(new Uint8Array(0))).toThrow(); + }); + + it("throws on a 3-byte buffer", () => { + expect(() => readStata(new Uint8Array([0, 1, 2]))).toThrow(); + }); + + it("throws on unknown old-format version byte", () => { + const bad = new Uint8Array(200); + bad[0] = 50; // version 50 is not a valid Stata version + expect(() => readStata(bad)).toThrow(); + }); +}); + +// ─── Edge cases ─────────────────────────────────────────────────────────────── + +describe("readStata ∘ toStata — edge cases", () => { + it("round-trips a single cell", () => { + const df = DataFrame.fromColumns({ x: [42] }); + const rt = roundTrip(df); + expect(rt.shape).toEqual([1, 1]); + expect([...rt.col("x").values]).toEqual([42]); + }); + + it("round-trips a zero-row DataFrame", () => { + const df = DataFrame.fromColumns({ a: [] as number[] }); + const rt = roundTrip(df); + expect(rt.shape[0]).toBe(0); + }); + + it("handles column names up to 32 chars (Stata limit)", () => { + const longName = "a".repeat(32); + const df = DataFrame.fromColumns({ [longName]: [1, 2] }); + const rt = roundTrip(df); + expect([...rt.columns.values][0]).toBe(longName); + }); + + it("column names longer than 32 chars are truncated to 32", () => { + const longName = "b".repeat(40); + const df = DataFrame.fromColumns({ [longName]: [1] }); + const rt = roundTrip(df); + const rtName = ([...rt.columns.values][0] as string) ?? 
""; + expect(rtName.length).toBe(32); + }); +}); + +// ─── Property-based tests ───────────────────────────────────────────────────── + +describe("readStata ∘ toStata — property-based", () => { + it("round-trip preserves shape [rows × 1 numeric column]", () => { + fc.assert( + fc.property( + fc.array(fc.option(fc.float({ noNaN: true }), { nil: null }), { + minLength: 0, + maxLength: 50, + }), + (vals) => { + const df = DataFrame.fromColumns({ v: vals }); + const rt = roundTrip(df); + expect(rt.shape[0]).toBe(vals.length); + expect(rt.shape[1]).toBe(1); + }, + ), + ); + }); + + it("round-trip preserves non-null finite doubles", () => { + // Stata stores doubles with |value| < 2^1023 as non-missing. + // Values >= 2^1023 share the Stata missing-value bit pattern and round-trip to null. + const stataDoubleRange = fc + .double({ noNaN: true, noDefaultInfinity: true }) + .filter((n) => Math.abs(n) < 2 ** 1023); + fc.assert( + fc.property( + fc.array(stataDoubleRange, { + minLength: 1, + maxLength: 30, + }), + (nums) => { + const df = DataFrame.fromColumns({ v: nums }); + const rt = roundTrip(df); + const out = [...rt.col("v").values] as number[]; + for (let i = 0; i < nums.length; i++) { + const n = nums[i]; + const o = out[i]; + if (n === undefined || o === undefined) continue; + expect(o).toBeCloseTo(n, 10); + } + }, + ), + ); + }); + + it("round-trip preserves null pattern in numeric column", () => { + fc.assert( + fc.property( + fc.array(fc.option(fc.integer({ min: -1000, max: 1000 }), { nil: null }), { + minLength: 0, + maxLength: 40, + }), + (vals) => { + const df = DataFrame.fromColumns({ v: vals }); + const rt = roundTrip(df); + const out = [...rt.col("v").values]; + const inNulls = vals.map((v) => v === null); + const outNulls = out.map((v) => v === null); + expect(outNulls).toEqual(inNulls); + }, + ), + ); + }); + + it("nRows clamps output row count to min(nRows, available)", () => { + fc.assert( + fc.property( + fc.array(fc.integer({ min: -1000, max: 1000 }), 
{ + minLength: 0, + maxLength: 50, + }), + fc.nat(60), + (vals, nRows) => { + const df = DataFrame.fromColumns({ v: vals }); + const rt = readStata(toStata(df), { nRows }); + expect(rt.shape[0]).toBe(Math.min(nRows, vals.length)); + }, + ), + ); + }); +}); diff --git a/tests/io/xml.test.ts b/tests/io/xml.test.ts new file mode 100644 index 00000000..0775d398 --- /dev/null +++ b/tests/io/xml.test.ts @@ -0,0 +1,370 @@ +/** + * Tests for readXml / toXml — XML I/O for DataFrame. + */ + +import { describe, expect, test } from "bun:test"; +import fc from "fast-check"; +import { DataFrame } from "../../src/index.ts"; +import { readXml, toXml } from "../../src/index.ts"; + +// ─── basic readXml ──────────────────────────────────────────────────────────── + +describe("readXml — basic parsing", () => { + test("parses child-element rows", () => { + const xml = ` + + Alice30 + Bob25 +`; + const df = readXml(xml); + expect(df.shape).toEqual([2, 2]); + expect(df.columns.toArray()).toEqual(["name", "age"]); + expect(df.col("name").toArray()).toEqual(["Alice", "Bob"]); + expect(df.col("age").toArray()).toEqual([30, 25]); + }); + + test("parses attribute rows", () => { + const xml = ` + + +`; + const df = readXml(xml); + expect(df.shape).toEqual([2, 2]); + expect(df.col("id").toArray()).toEqual([1, 2]); + expect(df.col("name").toArray()).toEqual(["Alice", "Bob"]); + }); + + test("mixes attributes and child elements", () => { + const xml = ` + + +`; + const df = readXml(xml, { rowTag: "item" }); + expect(df.shape).toEqual([2, 2]); + expect(df.col("id").toArray()).toEqual([1, 2]); + expect(df.col("label").toArray()).toEqual(["foo", "bar"]); + }); + + test("auto-detects rowTag", () => { + const xml = ` + 1 + 2 + 3 +`; + const df = readXml(xml); + expect(df.shape[0]).toBe(3); + expect(df.col("x").toArray()).toEqual([1, 2, 3]); + }); + + test("handles empty XML gracefully", () => { + const df = readXml(""); + expect(df.shape).toEqual([0, 0]); + }); + + test("returns empty DataFrame for 
no matching rows", () => { + const xml = "x"; + const df = readXml(xml, { rowTag: "row" }); + expect(df.shape).toEqual([0, 0]); + }); +}); + +// ─── options ────────────────────────────────────────────────────────────────── + +describe("readXml — options", () => { + const xml = ` + 1hello3.14 + 2world2.71 + 3foo1.41 +`; + + test("usecols filters columns", () => { + const df = readXml(xml, { usecols: ["a", "c"] }); + expect(df.columns.toArray()).toEqual(["a", "c"]); + expect(df.shape[1]).toBe(2); + }); + + test("nrows limits rows", () => { + const df = readXml(xml, { nrows: 2 }); + expect(df.shape[0]).toBe(2); + }); + + test("converters=false keeps strings", () => { + const df = readXml(xml, { converters: false }); + expect(df.col("a").toArray()).toEqual(["1", "2", "3"]); + }); + + test("naValues marks as null", () => { + const xml2 = ` + 1 + MISSING + 3 +`; + const df = readXml(xml2, { naValues: ["MISSING"] }); + expect(df.col("x").toArray()).toEqual([1, null, 3]); + }); + + test("indexCol by name", () => { + const df = readXml(xml, { indexCol: "a" }); + expect(df.columns.toArray()).toEqual(["b", "c"]); + expect(df.index.toArray()).toEqual([1, 2, 3]); + }); + + test("indexCol by number", () => { + const df = readXml(xml, { indexCol: 0 }); + expect(df.columns.toArray()).toEqual(["b", "c"]); + expect(df.index.toArray()).toEqual([1, 2, 3]); + }); + + test("attribs=false ignores attributes", () => { + const xml2 = ` + Alice + Bob +`; + const df = readXml(xml2, { attribs: false }); + expect(df.columns.toArray()).toEqual(["name"]); + }); + + test("elems=false ignores child elements", () => { + const xml2 = ` + Alice + Bob +`; + const df = readXml(xml2, { elems: false }); + expect(df.columns.toArray()).toEqual(["id"]); + }); +}); + +// ─── entity + CDATA handling ────────────────────────────────────────────────── + +describe("readXml — entities and CDATA", () => { + test("decodes named entities", () => { + const xml = "a & b < c"; + const df = readXml(xml, { converters: 
false }); + expect(df.col("v").at(0)).toBe("a & b < c"); + }); + + test("decodes numeric entities", () => { + const xml = "AB"; + const df = readXml(xml, { converters: false }); + expect(df.col("v").at(0)).toBe("AB"); + }); + + test("CDATA section text is read as-is", () => { + const xml = "]]>"; + const df = readXml(xml, { converters: false }); + expect(df.col("v").at(0)).toBe("hello & "); + }); + + test("comments are ignored", () => { + const xml = ` + + 1 + + 2 +`; + const df = readXml(xml); + expect(df.shape[0]).toBe(2); + }); +}); + +// ─── namespace handling ─────────────────────────────────────────────────────── + +describe("readXml — namespaces", () => { + test("strips namespace prefixes from element names", () => { + const xml = ` + Alice +`; + const df = readXml(xml, { rowTag: "row" }); + expect(df.columns.toArray()).toEqual(["name"]); + expect(df.col("name").at(0)).toBe("Alice"); + }); + + test("strips namespace prefixes from attribute names", () => { + const xml = ` + +`; + const df = readXml(xml); + expect(df.columns.toArray()).toContain("id"); + expect(df.columns.toArray()).toContain("val"); + }); +}); + +// ─── default NA values ──────────────────────────────────────────────────────── + +describe("readXml — built-in NA values", () => { + test("empty string becomes null", () => { + const xml = ""; + const df = readXml(xml); + expect(df.col("x").at(0)).toBeNull(); + }); + + test("NA string becomes null", () => { + const xml = "NA"; + const df = readXml(xml); + expect(df.col("x").at(0)).toBeNull(); + }); + + test("NaN string becomes null", () => { + const xml = "NaN"; + const df = readXml(xml); + expect(df.col("x").at(0)).toBeNull(); + }); +}); + +// ─── toXml basic ───────────────────────────────────────────────────────────── + +describe("toXml — basic serialization", () => { + test("produces valid XML with child elements by default", () => { + const df = DataFrame.fromColumns({ name: ["Alice", "Bob"], age: [30, 25] }); + const xml = toXml(df); + 
expect(xml).toContain(""); + expect(xml).toContain(""); + expect(xml).toContain("Alice"); + expect(xml).toContain("30"); + expect(xml).toContain(""); + }); + + test("custom root and row names", () => { + const df = DataFrame.fromColumns({ x: [1, 2] }); + const xml = toXml(df, { rootName: "records", rowName: "record" }); + expect(xml).toContain(""); + expect(xml).toContain(""); + expect(xml).toContain(""); + }); + + test("attribs mode emits attributes", () => { + const df = DataFrame.fromColumns({ id: [1, 2], name: ["Alice", "Bob"] }); + const xml = toXml(df, { attribs: true }); + expect(xml).toContain('id="1"'); + expect(xml).toContain('name="Alice"'); + }); + + test("xmlDeclaration=false omits PI", () => { + const df = DataFrame.fromColumns({ x: [1] }); + const xml = toXml(df, { xmlDeclaration: false }); + expect(xml).not.toContain(""); + }); + + test("namespaces are declared on root", () => { + const df = DataFrame.fromColumns({ x: [1] }); + const xml = toXml(df, { namespaces: { xsi: "http://www.w3.org/2001/XMLSchema-instance" } }); + expect(xml).toContain('xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'); + }); + + test("indent=null produces compact output", () => { + const df = DataFrame.fromColumns({ x: [1] }); + const xml = toXml(df, { indent: null }); + expect(xml).not.toContain(" "); // no leading spaces + }); + + test("cdataCols wraps in CDATA", () => { + const df = DataFrame.fromColumns({ html: ["bold"] }); + const xml = toXml(df, { cdataCols: ["html"] }); + expect(xml).toContain("bold]]>"); + }); + + test("encodes entities in non-CDATA columns", () => { + const df = DataFrame.fromColumns({ v: ["a & b"] }); + const xml = toXml(df, { cdataCols: [] }); + expect(xml).toContain("a & b"); + }); + + test("empty DataFrame produces root with no rows", () => { + const df = DataFrame.fromColumns({}); + const xml = toXml(df); + expect(xml).toContain(""); + expect(xml).toContain(""); + expect(xml).not.toContain(""); + }); +}); + +// ─── round-trip 
─────────────────────────────────────────────────────────────── + +describe("toXml / readXml round-trip", () => { + test("round-trips string columns", () => { + const df = DataFrame.fromColumns({ + name: ["Alice", "Bob", "Carol"], + city: ["NYC", "LA", "Chicago"], + }); + const xml = toXml(df, { xmlDeclaration: false }); + const df2 = readXml(xml, { converters: false }); + expect(df2.shape).toEqual(df.shape); + expect(df2.col("name").toArray()).toEqual(["Alice", "Bob", "Carol"]); + expect(df2.col("city").toArray()).toEqual(["NYC", "LA", "Chicago"]); + }); + + test("round-trips numeric columns", () => { + const df = DataFrame.fromColumns({ x: [1, 2, 3], y: [4.5, 5.6, 6.7] }); + const xml = toXml(df); + const df2 = readXml(xml); + expect(df2.col("x").toArray()).toEqual([1, 2, 3]); + expect(df2.col("y").toArray()).toEqual([4.5, 5.6, 6.7]); + }); + + test("round-trips attribs mode", () => { + const df = DataFrame.fromColumns({ id: [1, 2], name: ["Alice", "Bob"] }); + const xml = toXml(df, { attribs: true }); + const df2 = readXml(xml); + expect(df2.shape).toEqual(df.shape); + expect(df2.col("id").toArray()).toEqual([1, 2]); + expect(df2.col("name").toArray()).toEqual(["Alice", "Bob"]); + }); +}); + +// ─── property-based tests ───────────────────────────────────────────────────── + +describe("readXml / toXml — property tests", () => { + const safeStr = fc + .stringMatching(/^[A-Za-z0-9 _-]*$/) + .filter((s) => s.length > 0 && !["NA", "NaN", "N/A", "null", "None", "nan"].includes(s)); + + test("round-trip: toXml then readXml preserves shape", () => { + fc.assert( + fc.property( + fc.array(safeStr, { minLength: 1, maxLength: 4 }), + fc.integer({ min: 1, max: 5 }), + (colNames, nRows) => { + const uniqueCols = [...new Set(colNames)]; + const colData: Record = {}; + for (const c of uniqueCols) { + colData[c] = Array.from({ length: nRows }, (_, i) => `v${i}`); + } + const df = DataFrame.fromColumns(colData); + const xml = toXml(df); + const df2 = readXml(xml, { converters: 
false }); + return df2.shape[0] === nRows && df2.shape[1] === uniqueCols.length; + }, + ), + { numRuns: 50 }, + ); + }); + + test("toXml produces valid XML structure", () => { + fc.assert( + fc.property(fc.integer({ min: 0, max: 10 }), (nRows) => { + const df = DataFrame.fromColumns({ x: Array.from({ length: nRows }, (_, i) => i) }); + const xml = toXml(df); + return xml.includes("") && xml.includes(""); + }), + { numRuns: 50 }, + ); + }); + + test("nrows limits output correctly", () => { + fc.assert( + fc.property( + fc.integer({ min: 1, max: 10 }), + fc.integer({ min: 1, max: 10 }), + (total, limit) => { + const df = DataFrame.fromColumns({ x: Array.from({ length: total }, (_, i) => i) }); + const xml = toXml(df); + const df2 = readXml(xml, { nrows: limit }); + return df2.shape[0] === Math.min(total, limit); + }, + ), + { numRuns: 50 }, + ); + }); +}); diff --git a/tests/reshape/lreshape.test.ts b/tests/reshape/lreshape.test.ts new file mode 100644 index 00000000..5605abce --- /dev/null +++ b/tests/reshape/lreshape.test.ts @@ -0,0 +1,254 @@ +/** + * Tests for src/reshape/lreshape.ts — lreshape (wide → long with named groups). 
+ */ + +import { describe, expect, it } from "bun:test"; +import fc from "fast-check"; +import { DataFrame, type Scalar, lreshape } from "../../src/index.ts"; + +// ─── helpers ────────────────────────────────────────────────────────────────── + +function colValues(df: DataFrame, col: string): Scalar[] { + return [...df.col(col).values]; +} + +// ─── basic lreshape ─────────────────────────────────────────────────────────── + +describe("lreshape", () => { + describe("basic usage", () => { + it("reshapes a single group of two columns", () => { + const df = DataFrame.fromColumns({ + id: ["a", "b"], + v1: [1, 2], + v2: [3, 4], + }); + const result = lreshape(df, { v: ["v1", "v2"] }); + // 2 rows × 2 block positions = 4 output rows + expect(result.shape[0]).toBe(4); + expect(result.columns.values).toEqual(["id", "v"]); + // Block 0: v1 values, Block 1: v2 values + expect(colValues(result, "id")).toEqual(["a", "b", "a", "b"]); + expect(colValues(result, "v")).toEqual([1, 2, 3, 4]); + }); + + it("reshapes multiple groups simultaneously", () => { + const df = DataFrame.fromColumns({ + hr: [14, 7], + team: ["Red", "Blue"], + v1: [1, 3], + v2: [2, 4], + w1: [10, 30], + w2: [20, 40], + }); + const result = lreshape(df, { v: ["v1", "v2"], w: ["w1", "w2"] }); + expect(result.shape[0]).toBe(4); + expect(result.columns.values).toEqual(["hr", "team", "v", "w"]); + expect(colValues(result, "v")).toEqual([1, 3, 2, 4]); + expect(colValues(result, "w")).toEqual([10, 30, 20, 40]); + }); + + it("preserves id columns repeated per block", () => { + const df = DataFrame.fromColumns({ + id: [1, 2, 3], + x1: [10, 20, 30], + x2: [40, 50, 60], + }); + const result = lreshape(df, { x: ["x1", "x2"] }); + expect(result.shape[0]).toBe(6); + expect(colValues(result, "id")).toEqual([1, 2, 3, 1, 2, 3]); + expect(colValues(result, "x")).toEqual([10, 20, 30, 40, 50, 60]); + }); + + it("works with a single row", () => { + const df = DataFrame.fromColumns({ + a: [5], + b1: [1], + b2: [2], + b3: [3], + 
}); + const result = lreshape(df, { b: ["b1", "b2", "b3"] }); + expect(result.shape[0]).toBe(3); + expect(colValues(result, "a")).toEqual([5, 5, 5]); + expect(colValues(result, "b")).toEqual([1, 2, 3]); + }); + + it("works with no id columns (all columns in groups)", () => { + const df = DataFrame.fromColumns({ + x1: [1, 2], + x2: [3, 4], + }); + const result = lreshape(df, { x: ["x1", "x2"] }); + expect(result.shape[0]).toBe(4); + expect(result.columns.values).toEqual(["x"]); + expect(colValues(result, "x")).toEqual([1, 2, 3, 4]); + }); + }); + + describe("dropna behaviour", () => { + it("drops rows where any value column is null by default", () => { + const df = DataFrame.fromColumns({ + id: [1, 2, 3], + v1: [1, null, 3], + v2: [4, 5, 6], + }); + const result = lreshape(df, { v: ["v1", "v2"] }); + // Row with id=2 in block 0 (v1=null) is dropped; all block-1 rows kept + expect(result.shape[0]).toBe(5); + const ids = colValues(result, "id"); + expect(ids).not.toContain(null); + // id=2 is still present in block 1 (v2=5) + expect(ids).toContain(2); + }); + + it("keeps null rows when dropna=false", () => { + const df = DataFrame.fromColumns({ + id: [1, 2], + v1: [1, null], + v2: [3, 4], + }); + const result = lreshape(df, { v: ["v1", "v2"] }, { dropna: false }); + expect(result.shape[0]).toBe(4); + expect(colValues(result, "v")).toEqual([1, null, 3, 4]); + }); + + it("drops rows where NaN appears in value column", () => { + const df = DataFrame.fromColumns({ + id: [1, 2], + v1: [1, Number.NaN], + v2: [3, 4], + }); + // block 0, row 1 → v1=NaN → dropped; block 1, row 1 → v2=4 → kept + const result = lreshape(df, { v: ["v1", "v2"] }); + expect(result.shape[0]).toBe(3); + }); + }); + + describe("edge cases", () => { + it("returns empty DataFrame for empty source", () => { + const df = DataFrame.fromColumns({ + id: [] as Scalar[], + v1: [] as Scalar[], + v2: [] as Scalar[], + }); + const result = lreshape(df, { v: ["v1", "v2"] }); + expect(result.shape[0]).toBe(0); + 
expect(result.columns.values).toEqual(["id", "v"]); + }); + + it("returns source DataFrame when groups is empty", () => { + const df = DataFrame.fromColumns({ a: [1, 2], b: [3, 4] }); + const result = lreshape(df, {}); + expect(result.shape[0]).toBe(2); + }); + + it("throws when group lists have different lengths", () => { + const df = DataFrame.fromColumns({ + v1: [1, 2], + v2: [3, 4], + w1: [5, 6], + }); + expect(() => lreshape(df, { v: ["v1", "v2"], w: ["w1"] })).toThrow(/same length/); + }); + + it("throws when a referenced column does not exist", () => { + const df = DataFrame.fromColumns({ a: [1, 2] }); + expect(() => lreshape(df, { x: ["a", "MISSING"] })).toThrow(/not found/); + }); + + it("result always has a RangeIndex", () => { + const df = DataFrame.fromColumns({ id: [1, 2], v1: [10, 20], v2: [30, 40] }); + const result = lreshape(df, { v: ["v1", "v2"] }); + const idxVals = [...result.index.values]; + expect(idxVals).toEqual([0, 1, 2, 3]); + }); + + it("handles string values in value columns", () => { + const df = DataFrame.fromColumns({ + id: [1, 2], + a1: ["x", "y"], + a2: ["p", "q"], + }); + const result = lreshape(df, { a: ["a1", "a2"] }); + expect(colValues(result, "a")).toEqual(["x", "y", "p", "q"]); + }); + + it("handles three-group reshape correctly", () => { + const df = DataFrame.fromColumns({ + name: ["Alice", "Bob"], + score1: [80, 70], + score2: [85, 75], + score3: [90, 80], + }); + const result = lreshape(df, { score: ["score1", "score2", "score3"] }); + expect(result.shape[0]).toBe(6); + expect(colValues(result, "score")).toEqual([80, 70, 85, 75, 90, 80]); + expect(colValues(result, "name")).toEqual(["Alice", "Bob", "Alice", "Bob", "Alice", "Bob"]); + }); + }); + + describe("property-based tests", () => { + it("output row count equals nRows * k (when dropna=false)", () => { + fc.assert( + fc.property( + // Generate a small DataFrame with 0-4 id cols and 1-4 value cols + fc + .nat({ max: 4 }) + .chain((nId) => + fc.nat({ max: 3 }).chain((k) 
=> + fc.integer({ min: 1, max: 8 }).map((nRows) => { + const data: Record = {}; + for (let i = 0; i < nId; i++) { + data[`id${i}`] = Array.from({ length: nRows }, (_, j) => j + i); + } + for (let vi = 0; vi < k + 1; vi++) { + data[`v${vi}`] = Array.from({ length: nRows }, (_, j) => j * 10 + vi); + } + return { data, nId, k: k + 1, nRows }; + }), + ), + ), + ({ data, nId, k, nRows }) => { + const df = DataFrame.fromColumns(data); + const groups: Record = { v: [] }; + for (let vi = 0; vi < k; vi++) { + (groups["v"] as string[]).push(`v${vi}`); + } + const result = lreshape(df, groups, { dropna: false }); + expect(result.shape[0]).toBe(nRows * k); + }, + ), + { numRuns: 50 }, + ); + }); + + it("id column values are repeated k times each row (dropna=false)", () => { + fc.assert( + fc.property( + fc + .integer({ min: 1, max: 5 }) + .chain((nRows) => fc.integer({ min: 2, max: 4 }).map((k) => ({ nRows, k }))), + ({ nRows, k }) => { + const ids = Array.from({ length: nRows }, (_, i) => i + 1); + const data: Record = { id: ids }; + for (let vi = 0; vi < k; vi++) { + data[`v${vi}`] = Array.from({ length: nRows }, (_, j) => j * k + vi); + } + const groups: Record = { v: [] }; + for (let vi = 0; vi < k; vi++) { + (groups["v"] as string[]).push(`v${vi}`); + } + const df = DataFrame.fromColumns(data); + const result = lreshape(df, groups, { dropna: false }); + const outIds = colValues(result, "id"); + // Each original id appears exactly k times + for (const id of ids) { + const count = outIds.filter((v) => v === id).length; + expect(count).toBe(k); + } + }, + ), + { numRuns: 50 }, + ); + }); + }); +}); diff --git a/tests/stats/case_when.test.ts b/tests/stats/case_when.test.ts new file mode 100644 index 00000000..387495b2 --- /dev/null +++ b/tests/stats/case_when.test.ts @@ -0,0 +1,316 @@ +/** + * Tests for src/stats/case_when.ts + * Covers caseWhen — conditional value selection using CASE WHEN semantics. 
+ */ +import { describe, expect, it } from "bun:test"; +import fc from "fast-check"; +import { Series, caseWhen } from "../../src/index.ts"; +import type { Scalar } from "../../src/index.ts"; + +// ─── helpers ───────────────────────────────────────────────────────────────── + +function s(data: readonly Scalar[]): Series { + return new Series({ data: [...data] }); +} + +function boolS(data: readonly boolean[]): Series { + return new Series({ data: [...data] }); +} + +// ─── basic functionality ────────────────────────────────────────────────────── + +describe("caseWhen — basic", () => { + it("empty caselist returns copy of original", () => { + const ser = s([1, 2, 3]); + const res = caseWhen(ser, []); + expect(res.toArray()).toEqual([1, 2, 3]); + }); + + it("single branch — scalar replacement", () => { + const ser = s([1, 2, 3, 4]); + const cond = boolS([true, false, true, false]); + const res = caseWhen(ser, [[cond, 99]]); + expect(res.toArray()).toEqual([99, 2, 99, 4]); + }); + + it("single branch — Series replacement", () => { + const ser = s([1, 2, 3]); + const cond = boolS([true, false, true]); + const repl = s([10, 20, 30]); + const res = caseWhen(ser, [[cond, repl]]); + expect(res.toArray()).toEqual([10, 2, 30]); + }); + + it("single branch — array replacement", () => { + const ser = s([1, 2, 3]); + const cond = boolS([false, true, true]); + const res = caseWhen(ser, [[cond, [100, 200, 300]]]); + expect(res.toArray()).toEqual([1, 200, 300]); + }); + + it("first matching condition wins", () => { + const ser = s([1, 2, 3, 4, 5]); + const lt3 = boolS([true, true, false, false, false]); + const lt5 = boolS([true, true, true, true, false]); + const res = caseWhen(ser, [ + [lt3, "small"], + [lt5, "medium"], + ]); + expect(res.toArray()).toEqual(["small", "small", "medium", "medium", 5]); + }); + + it("grade classification — pandas docs example style", () => { + const score = new Series({ data: [45, 72, 88, 95, 60] }); + const d = score.toArray(); + const ge90 = 
boolS(d.map((v) => v >= 90)); + const ge75 = boolS(d.map((v) => v >= 75)); + const ge60 = boolS(d.map((v) => v >= 60)); + const ge45 = boolS(d.map((v) => v >= 45)); + const grade = caseWhen(score, [ + [ge90, "A"], + [ge75, "B"], + [ge60, "C"], + [ge45, "D"], + ]); + expect(grade.toArray()).toEqual(["D", "C", "B", "A", "C"]); + }); + + it("predicate function condition", () => { + const ser = s([10, 20, 30, 40]); + const res = caseWhen(ser, [[(v) => (v as number) > 25, "big"]]); + expect(res.toArray()).toEqual([10, 20, "big", "big"]); + }); + + it("predicate receives positional index as second arg", () => { + const ser = s([1, 2, 3, 4]); + const indices: number[] = []; + caseWhen(ser, [ + [ + (_v, i) => { + indices.push(i); + return false; + }, + 0, + ], + ]); + expect(indices).toEqual([0, 1, 2, 3]); + }); + + it("boolean array condition", () => { + const ser = s(["a", "b", "c", "d"]); + const res = caseWhen(ser, [[[true, false, false, true], "X"]]); + expect(res.toArray()).toEqual(["X", "b", "c", "X"]); + }); + + it("no condition matches — original value preserved", () => { + const ser = s([1, 2, 3]); + const allFalse = boolS([false, false, false]); + const res = caseWhen(ser, [[allFalse, 99]]); + expect(res.toArray()).toEqual([1, 2, 3]); + }); + + it("null original value preserved when no condition matches", () => { + const ser = s([null, 2, null]); + const allFalse = boolS([false, false, false]); + const res = caseWhen(ser, [[allFalse, 0]]); + expect(res.toArray()).toEqual([null, 2, null]); + }); + + it("handles null in replacement Series", () => { + const ser = s([1, 2, 3]); + const cond = boolS([true, true, true]); + const repl = s([null, null, null]); + const res = caseWhen(ser, [[cond, repl]]); + expect(res.toArray()).toEqual([null, null, null]); + }); + + it("preserves index from source series", () => { + const ser = new Series({ data: [1, 2, 3], index: ["a", "b", "c"] }); + const cond = boolS([true, false, true]); + const res = caseWhen(ser, [[cond, 0]]); + 
expect(res.index.toArray()).toEqual(["a", "b", "c"]); + }); + + it("all conditions true — first replacement always wins", () => { + const ser = s([1, 2, 3]); + const allTrue = boolS([true, true, true]); + const res = caseWhen(ser, [ + [allTrue, "first"], + [allTrue, "second"], + ]); + expect(res.toArray()).toEqual(["first", "first", "first"]); + }); + + it("mixed types in replacements", () => { + const ser = s([1, 2, 3, 4]); + const cond1 = boolS([true, false, false, false]); + const cond2 = boolS([false, true, false, false]); + const res = caseWhen(ser, [ + [cond1, "text"], + [cond2, 42.5], + ]); + expect(res.toArray()).toEqual(["text", 42.5, 3, 4]); + }); + + it("boolean Series condition with mismatched true values", () => { + const ser = s([10, 20, 30]); + const cond = boolS([false, true, false]); + const res = caseWhen(ser, [[cond, -1]]); + expect(res.toArray()).toEqual([10, -1, 30]); + }); + + it("three branches cover all rows", () => { + const ser = new Series({ data: [1, 5, 10, 15, 20] }); + const d = ser.toArray(); + const lt5 = boolS(d.map((v) => v < 5)); + const lt10 = boolS(d.map((v) => v < 10)); + const lt20 = boolS(d.map((v) => v < 20)); + const res = caseWhen(ser, [ + [lt5, "low"], + [lt10, "mid"], + [lt20, "high"], + ]); + expect(res.toArray()).toEqual(["low", "mid", "high", "high", 20]); + }); +}); + +// ─── edge cases ────────────────────────────────────────────────────────────── + +describe("caseWhen — edge cases", () => { + it("single element series", () => { + const ser = s([42]); + const res = caseWhen(ser, [[boolS([true]), "replaced"]]); + expect(res.toArray()).toEqual(["replaced"]); + }); + + it("empty series", () => { + const ser = s([]); + const res = caseWhen(ser, [[boolS([]), 0]]); + expect(res.toArray()).toEqual([]); + expect(res.length).toBe(0); + }); + + it("string series — text classification", () => { + const ser = s(["apple", "banana", "cherry", "date"]); + const res = caseWhen(ser, [ + [(v) => (v as string).length > 5, "long"], + 
[(v) => (v as string).length > 4, "medium"], + ]); + expect(res.toArray()).toEqual(["medium", "long", "long", "date"]); + }); + + it("boolean values in series", () => { + const ser = new Series({ data: [true, false, true] }); + const cond = boolS([true, true, false]); + const res = caseWhen(ser, [[cond, null]]); + expect(res.toArray()).toEqual([null, null, true]); + }); + + it("replacement array shorter than series uses null for missing", () => { + // When replacement array is shorter, missing positions yield null + const ser = s([1, 2, 3]); + const cond = boolS([false, false, true]); + const res = caseWhen(ser, [[cond, [10, 20]]]); + // index 2 is true, replacement[2] is undefined → null + expect(res.toArray()).toEqual([1, 2, null]); + }); +}); + +// ─── property-based tests ───────────────────────────────────────────────────── + +describe("caseWhen — property tests", () => { + it("length is always preserved", () => { + fc.assert( + fc.property( + fc.array(fc.integer({ min: -100, max: 100 }), { minLength: 0, maxLength: 20 }), + (data) => { + const ser = new Series({ data: [...data] }); + const cond = boolS(data.map((v) => v > 0)); + const res = caseWhen(ser, [[cond, 999]]); + return res.length === data.length; + }, + ), + ); + }); + + it("empty caselist is identity", () => { + fc.assert( + fc.property( + fc.array(fc.oneof(fc.integer(), fc.constant(null)), { minLength: 0, maxLength: 20 }), + (data) => { + const ser = s(data); + const res = caseWhen(ser, []); + const orig = ser.toArray(); + const got = res.toArray(); + for (let i = 0; i < orig.length; i++) { + if (orig[i] !== got[i]) return false; + } + return true; + }, + ), + ); + }); + + it("all-true condition replaces all values with scalar", () => { + fc.assert( + fc.property( + fc.array(fc.integer(), { minLength: 1, maxLength: 20 }), + fc.integer(), + (data, scalar) => { + const ser = new Series({ data: [...data] }); + const allTrue = boolS(data.map(() => true)); + const res = caseWhen(ser, [[allTrue, 
scalar]]); + return res.toArray().every((v) => v === scalar); + }, + ), + ); + }); + + it("all-false condition keeps original values", () => { + fc.assert( + fc.property(fc.array(fc.integer(), { minLength: 1, maxLength: 20 }), (data) => { + const ser = new Series({ data: [...data] }); + const allFalse = boolS(data.map(() => false)); + const res = caseWhen(ser, [[allFalse, 999]]); + const orig = ser.toArray(); + const got = res.toArray(); + for (let i = 0; i < orig.length; i++) { + if (orig[i] !== got[i]) return false; + } + return true; + }), + ); + }); + + it("index is preserved", () => { + fc.assert( + fc.property(fc.array(fc.integer(), { minLength: 1, maxLength: 15 }), (data) => { + const index = data.map((_, i) => `key_${i}`); + const ser = new Series({ data: [...data], index: [...index] }); + const cond = boolS(data.map((v) => v > 0)); + const res = caseWhen(ser, [[cond, 0]]); + return JSON.stringify(res.index.toArray()) === JSON.stringify(index); + }), + ); + }); + + it("predicate condition equivalent to boolean array", () => { + fc.assert( + fc.property( + fc.array(fc.integer({ min: -50, max: 50 }), { minLength: 1, maxLength: 20 }), + (data) => { + const ser = new Series({ data: [...data] }); + const bools = data.map((v) => v > 0); + const res1 = caseWhen(ser, [[boolS(bools), -1]]); + const res2 = caseWhen(ser, [[(v) => (v as number) > 0, -1]]); + const a1 = res1.toArray(); + const a2 = res2.toArray(); + for (let i = 0; i < a1.length; i++) { + if (a1[i] !== a2[i]) return false; + } + return true; + }, + ), + ); + }); +});