From 4430247d40c6f9994e4c7860396c566521b2c604 Mon Sep 17 00:00:00 2001
From: rm-dr <96270320+rm-dr@users.noreply.github.com>
Date: Thu, 11 Dec 2025 19:15:23 -0800
Subject: [PATCH] Chapter 2

---
 crates/llmfs/Cargo.toml                     |   2 +
 crates/llmfs/src/command/mod.rs             |   8 ++
 crates/llmfs/src/command/sample_data.rs     | 122 ++++++++++++++++++++
 crates/llmfs/src/command/train_tokenizer.rs |   8 +-
 4 files changed, 136 insertions(+), 4 deletions(-)
 create mode 100644 crates/llmfs/src/command/sample_data.rs

diff --git a/crates/llmfs/Cargo.toml b/crates/llmfs/Cargo.toml
index 4929ced..69bd819 100644
--- a/crates/llmfs/Cargo.toml
+++ b/crates/llmfs/Cargo.toml
@@ -12,9 +12,11 @@
 tokenizer = { workspace = true }
 anstyle = { workspace = true }
 anyhow = { workspace = true }
+burn = { workspace = true }
 clap = { workspace = true }
 futures-util = { workspace = true }
 indicatif = { workspace = true }
+ndarray = { workspace = true }
 parking_lot = { workspace = true }
 parquet = { workspace = true }
 rayon = { workspace = true }
diff --git a/crates/llmfs/src/command/mod.rs b/crates/llmfs/src/command/mod.rs
index 290a636..779132d 100644
--- a/crates/llmfs/src/command/mod.rs
+++ b/crates/llmfs/src/command/mod.rs
@@ -1,4 +1,5 @@
 mod download;
+mod sample_data;
 mod train_tokenizer;
 
 #[derive(Debug, clap::Subcommand)]
@@ -14,6 +15,12 @@ pub enum SubCommand {
         #[command(flatten)]
         args: train_tokenizer::TrainTokenizerArgs,
     },
+
+    /// Print sample (context, target) pairs from the training data
+    SampleData {
+        #[command(flatten)]
+        args: sample_data::SampleDataArgs,
+    },
 }
 
 impl SubCommand {
@@ -21,6 +28,7 @@
         match self {
             Self::Download { args } => args.run(mp),
             Self::TrainTokenizer { args } => args.run(mp),
+            Self::SampleData { args } => args.run(mp),
         }
     }
 }
diff --git a/crates/llmfs/src/command/sample_data.rs b/crates/llmfs/src/command/sample_data.rs
new file mode 100644
index 0000000..e7ffd93
--- /dev/null
+++ b/crates/llmfs/src/command/sample_data.rs
@@ -0,0 +1,122 @@
+use anyhow::{Context, Result};
+use burn::{
+    Tensor,
+    backend::{Cuda, cuda::CudaDevice},
+    nn::{Embedding, EmbeddingConfig},
+    tensor::Int,
+};
+use clap::Args;
+use indicatif::MultiProgress;
+use ndarray::Array2;
+use std::{fs::File, path::PathBuf};
+use tokenizer::Tokenizer;
+
+use crate::data_reader::DataReader;
+
+#[derive(Debug, Args, Clone)]
+
+pub struct SampleDataArgs {
+    /// Path to training data
+    #[clap(long, default_value = "data")]
+    data_dir: PathBuf,
+
+    /// Path to tokenizer
+    #[clap(long)]
+    tokenizer: PathBuf,
+
+    /// How many texts to sample
+    #[clap(long, short = 'n', default_value = "10")]
+    n: usize,
+
+    /// How many texts to skip
+    #[clap(long, short = 's', default_value = "0")]
+    skip: usize,
+}
+
+impl SampleDataArgs {
+    pub fn run(self, _mp: Option<MultiProgress>) -> Result<()> {
+        let device = CudaDevice::new(0);
+
+        let iter = DataReader::new(1, &self.data_dir).context("while initializing data reader")?;
+
+        let tokenizer = File::open(self.tokenizer).context("while opening tokenizer")?;
+        let tokenizer: Tokenizer =
+            serde_json::from_reader(tokenizer).context("while loading tokenizer")?;
+
+        let context_size = 4;
+        let stride = 4;
+
+        // Dimension of each token vector
+        let embedding_dim = 256;
+
+        let batch_size = 10;
+        let mut input_batch = Vec::with_capacity(batch_size);
+        let mut output_batch = Vec::with_capacity(batch_size);
+
+        #[expect(clippy::unwrap_used)] // Lazy error handling
+        let iter = iter.map(|x| x.unwrap()).skip(self.skip).take(self.n);
+
+        // Learned lookup tables: one vector per token id, one per position.
+        let tok_embedder = EmbeddingConfig::new(tokenizer.vocab_size() as usize, embedding_dim);
+        let tok_embedder: Embedding<Cuda> = tok_embedder.init(&device);
+
+        let pos_embedder = EmbeddingConfig::new(context_size, embedding_dim);
+        let pos_embedder: Embedding<Cuda> = pos_embedder.init(&device);
+
+        let pos_tensor: Tensor<Cuda, 2, Int> =
+            Tensor::arange(0..context_size as i64, &device).unsqueeze_dim(0);
+
+        // [1, context_size, dim]
+        let pos_embedding = pos_embedder.forward(pos_tensor);
+
+        println!("{:?}", pos_embedding.shape());
+
+        for i in iter {
+            let tokens = tokenizer.encode(&i);
+            // Targets are the inputs shifted one token right (next-token prediction)
+            for (a, b) in tokens
+                .windows(context_size)
+                .step_by(stride)
+                .zip(tokens[1..].windows(context_size).step_by(stride))
+            {
+                input_batch.push(a.to_owned());
+                output_batch.push(b.to_owned());
+
+                let context = a;
+                let desired = &b[b.len() - 1..];
+
+                println!("{context:?} -> {desired:?}");
+
+                /*
+                let input = tokenizer.decode(context);
+                let target = tokenizer.decode(desired);
+                println!("{input:?} -> {target:?}");
+                */
+
+                // TODO: handle the final partial batch
+                if input_batch.len() >= batch_size {
+                    let shape = [batch_size, context_size];
+
+                    let input = std::mem::replace(&mut input_batch, Vec::with_capacity(batch_size));
+                    let input: Array2<u32> = Array2::from_shape_fn(shape, |(a, b)| input[a][b]);
+                    let input: Tensor<Cuda, 2, Int> =
+                        Tensor::<_, 1, Int>::from_ints(input.as_slice().unwrap(), &device)
+                            .reshape(shape);
+
+                    let output =
+                        std::mem::replace(&mut output_batch, Vec::with_capacity(batch_size));
+                    let output: Array2<u32> = Array2::from_shape_fn(shape, |(a, b)| output[a][b]);
+                    let output: Tensor<Cuda, 2, Int> =
+                        Tensor::<_, 1, Int>::from_ints(output.as_slice().unwrap(), &device)
+                            .reshape(shape);
+
+                    // [batch_size, context_size, dim]
+                    let tok_e = tok_embedder.forward(input);
+                    let _embedded = tok_e.add(pos_embedding.clone()); // broadcasts over the batch dim
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
diff --git a/crates/llmfs/src/command/train_tokenizer.rs b/crates/llmfs/src/command/train_tokenizer.rs
index 8e74b8d..b91c125 100644
--- a/crates/llmfs/src/command/train_tokenizer.rs
+++ b/crates/llmfs/src/command/train_tokenizer.rs
@@ -12,14 +12,14 @@ use crate::data_reader::DataReader;
 
 #[derive(Debug, Args, Clone)]
 pub struct TrainTokenizerArgs {
-    /// Path to training data
-    #[clap(default_value = "data")]
-    data_dir: PathBuf,
-
     /// Where to save tokenizer
     #[clap(default_value = "tokenizer.json")]
     target: PathBuf,
 
+    /// Path to training data
+    #[clap(long, default_value = "data")]
+    data_dir: PathBuf,
+
     /// Only train on the first n texts
     #[clap(long)]
     first_n: Option<usize>,
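
The sliding-window sampling in sample_data.rs is the core of this chapter, so here
is a minimal, dependency-free sketch of just that part, with DataReader, the
tokenizer, and burn stripped out. The token values and sizes are invented for
illustration; the real command gets its token ids from tokenizer.encode().

fn main() {
    // Stand-in for the output of tokenizer.encode() on one text.
    let tokens: Vec<u32> = (0..12).collect();
    let context_size = 4;
    let stride = 4;

    // Pair each context window with the same window shifted right by one
    // token: the model's target for position i is the token at position i + 1.
    for (context, target) in tokens
        .windows(context_size)
        .step_by(stride)
        .zip(tokens[1..].windows(context_size).step_by(stride))
    {
        // The "desired" next token after the whole context is the last
        // element of the target window.
        println!("{context:?} -> {:?}", target.last().unwrap());
    }
}

With stride equal to context_size the windows tile the text without overlap; a
smaller stride would produce overlapping, and therefore more, training pairs from
the same text.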
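The embedding half of the pipeline can be sketched the same way. This is a
hypothetical standalone program, assuming burn is built with its ndarray backend
enabled so it runs without a GPU; the shapes mirror sample_data.rs, just with a
smaller vocabulary and embedding dimension.

use burn::{
    backend::NdArray,
    nn::EmbeddingConfig,
    tensor::{Int, Tensor},
};

fn main() {
    let device = Default::default();
    let (vocab_size, context_size, dim) = (16, 4, 8);

    // Two learned lookup tables: one row per token id, one row per position.
    let tok_embedder = EmbeddingConfig::new(vocab_size, dim).init::<NdArray>(&device);
    let pos_embedder = EmbeddingConfig::new(context_size, dim).init::<NdArray>(&device);

    // A batch of two "sentences", four token ids each: shape [2, 4].
    let ids: Tensor<NdArray, 2, Int> =
        Tensor::from_ints([[0, 1, 2, 3], [4, 5, 6, 7]], &device);
    let positions: Tensor<NdArray, 2, Int> =
        Tensor::arange(0..context_size as i64, &device).unsqueeze_dim(0);

    // [2, 4, 8] + [1, 4, 8]: the position table broadcasts over the batch
    // dimension, which is why sample_data.rs needs no extra unsqueeze.
    let embedded = tok_embedder.forward(ids).add(pos_embedder.forward(positions));
    println!("{:?}", embedded.shape()); // [2, 4, 8]
}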