
Chapter 2

2025-12-11 19:15:23 -08:00
parent 35f64c16c1
commit 0995feb6ee
4 changed files with 136 additions and 4 deletions

View File

@@ -12,9 +12,11 @@ tokenizer = { workspace = true }
 anstyle = { workspace = true }
 anyhow = { workspace = true }
+burn = { workspace = true }
 clap = { workspace = true }
 futures-util = { workspace = true }
 indicatif = { workspace = true }
+ndarray = { workspace = true }
 parking_lot = { workspace = true }
 parquet = { workspace = true }
 rayon = { workspace = true }
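
Both new dependencies are pulled in with `workspace = true`, so their versions are assumed to be pinned once in the root manifest's `[workspace.dependencies]` table, roughly like this (versions and features here are hypothetical, not part of this commit):

[workspace.dependencies]
# Hypothetical pins; the real entries live in the workspace root manifest.
burn = { version = "0.16", features = ["cuda"] }
ndarray = "0.16"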

View File

@@ -1,4 +1,5 @@
 mod download;
+mod sample_data;
 mod train_tokenizer;
 
 #[derive(Debug, clap::Subcommand)]
@@ -14,6 +15,12 @@ pub enum SubCommand {
         #[command(flatten)]
         args: train_tokenizer::TrainTokenizerArgs,
     },
+    /// Sample data
+    SampleData {
+        #[command(flatten)]
+        args: sample_data::SampleDataArgs,
+    },
 }
 
 impl SubCommand {
@@ -21,6 +28,7 @@ impl SubCommand {
         match self {
             Self::Download { args } => args.run(mp),
             Self::TrainTokenizer { args } => args.run(mp),
+            Self::SampleData { args } => args.run(mp),
         }
     }
 }
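
Assuming clap's default kebab-case naming (nothing in this diff overrides it), the new subcommand would be invoked along the lines of `cargo run -- sample-data --tokenizer tokenizer.json -n 10 --skip 0`; the `#[command(flatten)]` attribute hoists the fields of `SampleDataArgs` directly onto the subcommand.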

View File

@@ -0,0 +1,122 @@
use anyhow::{Context, Result};
use burn::{
    Tensor,
    backend::{Cuda, cuda::CudaDevice},
    nn::{Embedding, EmbeddingConfig},
    tensor::Int,
};
use clap::Args;
use indicatif::MultiProgress;
use ndarray::Array2;
use std::{fs::File, path::PathBuf};
use tokenizer::Tokenizer;

use crate::data_reader::DataReader;

#[derive(Debug, Args, Clone)]
pub struct SampleDataArgs {
    /// Path to training data
    #[clap(long, default_value = "data")]
    data_dir: PathBuf,
    /// Path to tokenizer
    #[clap(long)]
    tokenizer: PathBuf,
    /// How many texts to return
    #[clap(long, short = 'n', default_value = "10")]
    n: usize,
    /// How many texts to skip
    #[clap(long, short = 's', default_value = "0")]
    skip: usize,
}

impl SampleDataArgs {
    pub fn run(self, _mp: Option<MultiProgress>) -> Result<()> {
        let device = CudaDevice::new(0);
        let iter = DataReader::new(1, &self.data_dir).context("while initializing data reader")?;
        let tokenizer = File::open(self.tokenizer).context("while opening tokenizer")?;
        let tokenizer: Tokenizer =
            serde_json::from_reader(tokenizer).context("while loading tokenizer")?;

        // Sliding-window hyperparameters: each example is a window of
        // `context_size` tokens, and consecutive windows start `stride`
        // tokens apart.
        let context_size = 4;
        let stride = 4;
        // Dimension of each token vector
        let embedding_dim = 256;
        let batch_size = 10;

        let mut input_batch = Vec::with_capacity(batch_size);
        let mut output_batch = Vec::with_capacity(batch_size);

        #[expect(clippy::unwrap_used)] // Lazy error handling
        let iter = iter.map(|x| x.unwrap()).skip(self.skip).take(self.n);

        // Token embedding: one learned vector per vocabulary entry.
        let tok_embedder = EmbeddingConfig::new(tokenizer.vocab_size() as usize, embedding_dim);
        let tok_embedder: Embedding<Cuda> = tok_embedder.init(&device);
        // Positional embedding: one learned vector per position in the window.
        let pos_embedder = EmbeddingConfig::new(context_size, embedding_dim);
        let pos_embedder: Embedding<Cuda> = pos_embedder.init(&device);

        // Positions 0..context_size as a [1, context_size] index tensor.
        let pos_tensor: Tensor<Cuda, 2, Int> =
            Tensor::arange(0..context_size as i64, &device).unsqueeze_dim(0);
        // [1, context_size, dim]
        let pos_embedding = pos_embedder.forward(pos_tensor);
        println!("{:?}", pos_embedding.shape());

        for text in iter {
            let tokens = tokenizer.encode(&text);
            // Pair each window `a` with the window `b` starting `stride`
            // tokens later; with stride == context_size, `b` is simply the
            // next block of tokens after `a`.
            for (a, b) in tokens
                .windows(context_size)
                .step_by(stride)
                .zip(tokens[stride..].windows(context_size).step_by(stride))
            {
                input_batch.push(a.to_owned());
                output_batch.push(b.to_owned());
                let context = a;
                let desired = &b[b.len() - 1..];
                println!("{context:?} -> {desired:?}");
                /*
                let input = tokenizer.decode(context);
                let target = tokenizer.decode(desired);
                println!("{input:?} -> {target:?}");
                */

                // TODO: handle the final, partially filled batch
                if input_batch.len() >= batch_size {
                    let shape = [batch_size, context_size];
                    // Stage each batch in an ndarray, copy it to the device
                    // as a flat tensor, then reshape to [batch, context].
                    let input = std::mem::replace(&mut input_batch, Vec::with_capacity(batch_size));
                    let input: Array2<u32> = Array2::from_shape_fn(shape, |(a, b)| input[a][b]);
                    let input: Tensor<Cuda, 2, Int> =
                        Tensor::<_, 1, Int>::from_ints(input.as_slice().unwrap(), &device)
                            .reshape(shape);
                    let output =
                        std::mem::replace(&mut output_batch, Vec::with_capacity(batch_size));
                    let output: Array2<u32> = Array2::from_shape_fn(shape, |(a, b)| output[a][b]);
                    let output: Tensor<Cuda, 2, Int> =
                        Tensor::<_, 1, Int>::from_ints(output.as_slice().unwrap(), &device)
                            .reshape(shape);

                    // [batch_size, context_size, dim]
                    let tok_e = tok_embedder.forward(input);
                    // pos_embedding is [1, context_size, dim]; its leading
                    // axis broadcasts across the batch, so it can be added
                    // directly without another unsqueeze.
                    let _input_embedding = tok_e.add(pos_embedding.clone());
                }
            }
        }
        Ok(())
    }
}
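
The window pairing above may be easier to see in isolation. Here is a minimal, self-contained sketch with made-up token ids and no burn or tokenizer dependency (everything in it is illustrative, not part of the commit):

fn main() {
    let tokens: Vec<u32> = (1..=12).collect();
    let context_size = 4;
    let stride = 4;
    // With stride == context_size the windows tile the text:
    //   [1, 2, 3, 4] -> [5, 6, 7, 8]
    //   [5, 6, 7, 8] -> [9, 10, 11, 12]
    for (a, b) in tokens
        .windows(context_size)
        .step_by(stride)
        .zip(tokens[stride..].windows(context_size).step_by(stride))
    {
        println!("{a:?} -> {b:?}");
    }
}

Note that the target window is offset from the context by `stride` tokens; the classic next-token setup would instead zip against `tokens[1..]`, so that each target is the input shifted by exactly one position.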

View File

@@ -12,14 +12,14 @@ use crate::data_reader::DataReader;
 #[derive(Debug, Args, Clone)]
 pub struct TrainTokenizerArgs {
-    /// Path to training data
-    #[clap(default_value = "data")]
-    data_dir: PathBuf,
     /// Where to save tokenizer
     #[clap(default_value = "tokenizer.json")]
     target: PathBuf,
+    /// Path to training data
+    #[clap(long, default_value = "data")]
+    data_dir: PathBuf,
     /// Only train on the first n texts
     #[clap(long)]
     first_n: Option<usize>,
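
With `data_dir` now an explicit `--data-dir` flag, `target` presumably remains the only positional argument, so an invocation would look something like `cargo run -- train-tokenizer tokenizer.json --data-dir data --first-n 1000` (subcommand and flag names again assume clap's default kebab-case mapping).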