Run language models locally in browser and Node.js with WebGPU acceleration
tinylm provides an OpenAI-compatible API for running language models directly in your browser or Node.js application using WebGPU acceleration. No server required, zero-cost inference, and complete privacy with client-side processing.
npm install tinylm
# or
yarn add tinylm
import { TinyLM } from "tinylm";
// Create a TinyLM instance
const tiny = new TinyLM();
// Initialize and load a model
await tiny.init({
models: ["HuggingFaceTB/SmolLM2-135M-Instruct"],
});
// Generate a completion
const response = await tiny.chat.completions.create({
messages: [
{ role: "system", content: "You are a helpful AI assistant." },
{ role: "user", content: "What is artificial intelligence?" },
],
temperature: 0.7,
max_tokens: 100,
});
console.log(response.choices[0].message.content);
↑ This example demonstrates basic text generation with tinylm.
import { TinyLM } from "tinylm";
const tiny = new TinyLM();
await tiny.init({
embeddingModels: ["nomic-ai/nomic-embed-text-v1.5"],
});
// Generate embeddings for text
const embedding = await tiny.embeddings.create({
model: "nomic-ai/nomic-embed-text-v1.5",
input: "Your text string goes here",
});
console.log(`Embedding dimensions: ${embedding.data[0].embedding.length}`);
console.log(`Token usage: ${embedding.usage.prompt_tokens} tokens`);
↑ Generate embeddings locally for semantic search and other applications.
import { TinyLM } from "tinylm";
const tiny = new TinyLM();
await tiny.init();
await tiny.models.load({ model: "HuggingFaceTB/SmolLM2-135M-Instruct" });
// Generate a streaming response
const stream = await tiny.chat.completions.create({
messages: [
{ role: "system", content: "You are a creative storyteller." },
{ role: "user", content: "Write a short poem about technology." },
],
temperature: 0.9,
max_tokens: 200,
stream: true, // Enable streaming
});
// Process the stream
for await (const chunk of stream) {
const content = chunk.choices[0]?.delta?.content || "";
process.stdout.write(content); // Display content as it arrives
}
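If you need to stop a long streaming generation early (for example, from a cancel button), the API reference below lists models.interrupt(). A minimal sketch, assuming interrupt() can be called while the stream is still being consumed; the timeout value is illustrative:
// Register a cancel path before consuming the stream.
// Assumption: tiny.models.interrupt() (see API reference) stops the in-flight generation.
const cancelTimer = setTimeout(() => tiny.models.interrupt(), 5000);
// ... start and consume the stream as shown above, then:
clearTimeout(cancelTimer);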
import { TinyLM } from "tinylm";
// Format bytes to human-readable size
function formatBytes(bytes) {
if (bytes === 0 || !bytes) return "0 B";
const sizes = ["B", "KB", "MB", "GB"];
const i = Math.floor(Math.log(bytes) / Math.log(1024));
return `${(bytes / Math.pow(1024, i)).toFixed(2)} ${sizes[i]}`;
}
// Create TinyLM with detailed progress tracking
const tiny = new TinyLM({
progressCallback: (progress) => {
if (progress.type === "model" && progress.overall) {
const { bytesLoaded, bytesTotal, percentComplete, speed } =
progress.overall;
console.log(
`Loading model: ${percentComplete}% - ` +
`${formatBytes(bytesLoaded)}/${formatBytes(bytesTotal)} ` +
`at ${formatBytes(speed)}/s`
);
// Log individual file progress
if (progress.files && progress.files.length > 0) {
const activeFiles = progress.files.filter((f) => f.status !== "done");
if (activeFiles.length > 0) {
console.log(`Active downloads: ${activeFiles.length}`);
activeFiles.forEach((file) => {
console.log(` ${file.name}: ${file.percentComplete}%`);
});
}
}
}
},
});
await tiny.init();
await tiny.models.load({ model: "HuggingFaceTB/SmolLM2-135M-Instruct" });
import { TinyLM } from "tinylm";
// Create TinyLM instance
const tiny = new TinyLM();
await tiny.init();
// Set up a document collection
const documents = [
"Artificial intelligence is rapidly transforming technology",
"Machine learning models require large datasets to train properly",
"Neural networks are loosely inspired by the human brain",
"The climate crisis requires immediate global action",
"Renewable energy sources are crucial for sustainability",
"Good programming practices improve code maintainability",
];
// Function to calculate cosine similarity
function cosineSimilarity(a, b) {
let dotProduct = 0;
for (let i = 0; i < a.length; i++) {
dotProduct += a[i] * b[i];
}
return dotProduct; // Vectors are already normalized
}
// Index the documents by generating embeddings
console.log("Generating embeddings for documents...");
const documentsEmbeddings = await tiny.embeddings.create({
model: "Xenova/all-MiniLM-L6-v2",
input: documents,
});
const documentVectors = documentsEmbeddings.data.map((d) => d.embedding);
// Create a search function
async function semanticSearch(query, topK = 2) {
// Generate embedding for the query
const queryEmbedding = await tiny.embeddings.create({
model: "Xenova/all-MiniLM-L6-v2",
input: query,
});
const queryVector = queryEmbedding.data[0].embedding;
// Compare to all documents
const similarities = documentVectors.map((docVector, i) => {
return {
document: documents[i],
score: cosineSimilarity(queryVector, docVector),
};
});
// Sort by similarity (descending)
similarities.sort((a, b) => b.score - a.score);
// Return top K results
return similarities.slice(0, topK);
}
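Calling the search function then looks like this; the query string and result formatting are illustrative:
// Run a query against the indexed documents above
const results = await semanticSearch("How do machines learn from data?");
results.forEach((result, i) => {
  console.log(`${i + 1}. (${result.score.toFixed(3)}) ${result.document}`);
});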
constructor(options)
: Create a new TinyLM instance
  progressCallback: Function called with progress updates
  progressThrottleTime: Milliseconds between progress updates (default: 100)

init(options)
: Initialize TinyLM with optional model preloading
  models: Text generation models to preload
  embeddingModels: Embedding models to preload
  lazyLoad: Don't load models immediately (default: false)

chat.completions.create(options)
: Generate text completions with an OpenAI-compatible interface
  messages: Array of message objects
  model: Optional if already loaded
  temperature: Controls randomness (0-1)
  max_tokens: Maximum tokens to generate
  stream: Set to true for streaming

embeddings.create(options)
: Generate embeddings for text with an OpenAI-compatible interface
  model: Embedding model to use
  input: Single string or array of strings
  encoding_format: 'float' (default) or 'base64'
  dimensions: Optional, specify desired dimensions

models.load(options)
: Load a model for use
  model: Model identifier
  quantization: Optional quantization level

models.offload(options)
: Unload a model to free memory

models.list()
: List all currently loaded models

models.check()
: Check hardware capabilities for WebGPU acceleration

models.interrupt()
: Interrupt an ongoing generation

models.reset()
: Reset the generation state

tinylm works directly in both browser and Node.js environments with a consistent API.
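For example, the model-management flow from the API reference runs unchanged in a browser bundle or a Node.js script. A minimal sketch, assuming models.check() and models.list() are async and return values that can be logged directly:
import { TinyLM } from "tinylm";

// The same code runs in a browser bundle or a Node.js script.
const tiny = new TinyLM();
await tiny.init();

// Check hardware capabilities (WebGPU availability) before loading a model.
console.log(await tiny.models.check());

// Load a model, list what is loaded, generate, then free memory.
await tiny.models.load({ model: "HuggingFaceTB/SmolLM2-135M-Instruct" });
console.log(await tiny.models.list());

const reply = await tiny.chat.completions.create({
  messages: [{ role: "user", content: "Say hello in one sentence." }],
  max_tokens: 50,
});
console.log(reply.choices[0].message.content);

await tiny.models.offload({ model: "HuggingFaceTB/SmolLM2-135M-Instruct" });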