Chapter 2
@@ -12,9 +12,11 @@ tokenizer = { workspace = true }
anstyle = { workspace = true }
anyhow = { workspace = true }
burn = { workspace = true }
clap = { workspace = true }
futures-util = { workspace = true }
indicatif = { workspace = true }
ndarray = { workspace = true }
parking_lot = { workspace = true }
parquet = { workspace = true }
rayon = { workspace = true }
@@ -1,4 +1,5 @@
mod download;
mod sample_data;
mod train_tokenizer;

#[derive(Debug, clap::Subcommand)]
@@ -14,6 +15,12 @@ pub enum SubCommand {
        #[command(flatten)]
        args: train_tokenizer::TrainTokenizerArgs,
    },

    /// Sample data
    SampleData {
        #[command(flatten)]
        args: sample_data::SampleDataArgs,
    },
}

impl SubCommand {
@@ -21,6 +28,7 @@ impl SubCommand {
        match self {
            Self::Download { args } => args.run(mp),
            Self::TrainTokenizer { args } => args.run(mp),
            Self::SampleData { args } => args.run(mp),
        }
    }
}
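The hunks above extend the crate's clap subcommand dispatch: each variant carries a flattened Args struct and forwards to its run method. A minimal, self-contained sketch of that pattern, assuming a hypothetical Cli wrapper, main, and single `n` field for illustration (SubCommand and SampleData match the diff):

use clap::{Args, Parser, Subcommand};

#[derive(Debug, Args, Clone)]
struct SampleDataArgs {
    /// How many texts to return
    #[clap(long, short = 'n', default_value = "10")]
    n: usize,
}

#[derive(Debug, Subcommand)]
enum SubCommand {
    /// Sample data
    SampleData {
        #[command(flatten)]
        args: SampleDataArgs,
    },
}

// Hypothetical top-level parser; the real crate wires SubCommand in elsewhere.
#[derive(Debug, Parser)]
struct Cli {
    #[command(subcommand)]
    cmd: SubCommand,
}

fn main() {
    match Cli::parse().cmd {
        SubCommand::SampleData { args } => println!("sampling {} texts", args.n),
    }
}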
crates/llmfs/src/command/sample_data.rs (new file, 122 lines)
@@ -0,0 +1,122 @@
use anyhow::{Context, Result};
use burn::{
    Tensor,
    backend::{Cuda, cuda::CudaDevice},
    nn::{Embedding, EmbeddingConfig},
    tensor::Int,
};
use clap::Args;
use indicatif::MultiProgress;
use ndarray::Array2;
use std::{fs::File, path::PathBuf};
use tokenizer::Tokenizer;

use crate::data_reader::DataReader;

#[derive(Debug, Args, Clone)]
pub struct SampleDataArgs {
    /// Path to training data
    #[clap(long, default_value = "data")]
    data_dir: PathBuf,

    /// Path to tokenizer
    #[clap(long)]
    tokenizer: PathBuf,

    /// How many texts to return
    #[clap(long, short = 'n', default_value = "10")]
    n: usize,

    /// How many texts to skip
    #[clap(long, short = 's', default_value = "0")]
    skip: usize,
}

impl SampleDataArgs {
    pub fn run(self, _mp: Option<MultiProgress>) -> Result<()> {
        let device = CudaDevice::new(0);

        let iter = DataReader::new(1, &self.data_dir).context("while initializing data reader")?;

        let tokenizer = File::open(self.tokenizer).context("while opening tokenizer")?;
        let tokenizer: Tokenizer =
            serde_json::from_reader(tokenizer).context("while loading tokenizer")?;

        let context_size = 4;
        let stride = 4;

        // Dimension of each token vector
        let embedding_dim = 256;

        let batch_size = 10;
        let mut input_batch = Vec::with_capacity(batch_size);
        let mut output_batch = Vec::with_capacity(batch_size);

        #[expect(clippy::unwrap_used)] // Lazy error handling
        let iter = iter.map(|x| x.unwrap()).skip(self.skip).take(self.n);

        // Token embedding table: maps each vocabulary id to a learned
        // `embedding_dim`-sized vector.
        let tok_embedder = EmbeddingConfig::new(tokenizer.vocab_size() as usize, embedding_dim);
        let tok_embedder: Embedding<Cuda> = tok_embedder.init(&device);

        // Positional embedding table: one vector per position in the context window.
        let pos_embedder = EmbeddingConfig::new(context_size, embedding_dim);
        let pos_embedder: Embedding<Cuda> = pos_embedder.init(&device);

        let pos_tensor: Tensor<Cuda, 2, Int> =
            Tensor::arange(0..context_size as i64, &device).unsqueeze_dim(0);

        // [1, context_size, dim]
        let pos_embedding = pos_embedder.forward(pos_tensor);

        println!("{:?}", pos_embedding.shape());

        for i in iter {
            let tokens = tokenizer.encode(&i);

            for (a, b) in tokens
                .windows(context_size)
                .step_by(stride)
                .zip(tokens[stride..].windows(context_size).step_by(stride))
            {
                input_batch.push(a.to_owned());
                output_batch.push(b.to_owned());

                let context = a;
                let desired = &b[b.len() - 1..];

                println!("{context:?} -> {desired:?}");

                /*
                let input = tokenizer.decode(context);
                let target = tokenizer.decode(desired);
                println!("{input:?} -> {target:?}");
                */

                // TODO: handle the final partial batch (currently dropped)
                if input_batch.len() >= batch_size {
                    let shape = [batch_size, context_size];

                    let input = std::mem::replace(&mut input_batch, Vec::with_capacity(batch_size));
                    let input: Array2<u32> = Array2::from_shape_fn(shape, |(a, b)| input[a][b]);
                    let input: Tensor<Cuda, 2, Int> =
                        Tensor::<_, 1, Int>::from_ints(input.as_slice().unwrap(), &device)
                            .reshape(shape);

                    let output =
                        std::mem::replace(&mut output_batch, Vec::with_capacity(batch_size));
                    let output: Array2<u32> = Array2::from_shape_fn(shape, |(a, b)| output[a][b]);
                    let output: Tensor<Cuda, 2, Int> =
                        Tensor::<_, 1, Int>::from_ints(output.as_slice().unwrap(), &device)
                            .reshape(shape);

                    // [batch_size, context_size, dim]
                    let tok_e = tok_embedder.forward(input);
                    let tok_e: Tensor<Cuda, 3> = Tensor::from_data(tok_e.to_data(), &device);
                    // `pos_embedding` is already [1, context_size, dim], so it
                    // broadcasts over the batch dimension; no extra unsqueeze needed.
                    let tok_e = tok_e.add(pos_embedding.clone());
                }
            }
        }

        Ok(())
    }
}
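The windows/step_by/zip chain above is the heart of the sampler: it slides a context_size window over the token stream in steps of stride and pairs each input window with the window stride tokens later. A dependency-free sketch of the same pairing (token values hypothetical):

fn main() {
    let tokens: Vec<u32> = (0..12).collect(); // stand-in for tokenizer output
    let (context_size, stride) = (4, 4);

    for (input, target) in tokens
        .windows(context_size)
        .step_by(stride)
        .zip(tokens[stride..].windows(context_size).step_by(stride))
    {
        // Prints [0, 1, 2, 3] -> [7], then [4, 5, 6, 7] -> [11]
        println!("{input:?} -> {:?}", &target[target.len() - 1..]);
    }
}

Note that the printed target sits stride tokens past the end of the input window, rather than the single-token shift conventional for next-token training; with stride == context_size the pairs predict four positions ahead.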
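One subtlety in the batch block above: pos_embedding already has shape [1, context_size, dim], so adding it to the [batch_size, context_size, dim] token embeddings only needs broadcasting over the leading dimension. The same shape logic in plain ndarray (already in this crate's dependency list; array contents hypothetical):

use ndarray::Array3;

fn main() {
    let (batch, ctx, dim) = (10, 4, 256);

    // Stand-ins for the token and positional embedding activations.
    let tok_e = Array3::<f32>::zeros((batch, ctx, dim));
    let pos_e = Array3::<f32>::ones((1, ctx, dim));

    // [batch, ctx, dim] + [1, ctx, dim]: the right-hand side is
    // broadcast over the batch dimension.
    let summed = &tok_e + &pos_e;
    assert_eq!(summed.shape(), &[batch, ctx, dim]);
}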
@@ -12,14 +12,14 @@ use crate::data_reader::DataReader;
#[derive(Debug, Args, Clone)]
pub struct TrainTokenizerArgs {
    /// Path to training data
    #[clap(default_value = "data")]
    data_dir: PathBuf,

    /// Where to save tokenizer
    #[clap(default_value = "tokenizer.json")]
    target: PathBuf,

    /// Path to training data
    #[clap(long, default_value = "data")]
    data_dir: PathBuf,

    /// Only train on the first n texts
    #[clap(long)]
    first_n: Option<usize>,
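In this last hunk the duplicated data_dir field is the before and after of a single change: the data directory moves from a positional argument to a --data-dir flag, leaving target as the lone positional. A sketch of the observable difference, with hypothetical struct names and argument values:

use clap::Parser;

#[derive(Debug, Parser)]
struct Before {
    #[clap(default_value = "data")]
    data_dir: String,
    #[clap(default_value = "tokenizer.json")]
    target: String,
}

#[derive(Debug, Parser)]
struct After {
    #[clap(default_value = "tokenizer.json")]
    target: String,
    #[clap(long, default_value = "data")]
    data_dir: String,
}

fn main() {
    // Before: the first bare argument is the data dir, the second the target.
    println!("{:?}", Before::parse_from(["train-tokenizer", "my-data", "tok.json"]));

    // After: only `target` is positional; the data dir needs --data-dir.
    println!("{:?}", After::parse_from(["train-tokenizer", "tok.json", "--data-dir", "my-data"]));
}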