Created
March 20, 2021 23:29
-
-
Save lpj145/0f903bfdbac6562a0635089b0f9ef3ea to your computer and use it in GitHub Desktop.
Processar 9 milhões de linhas CSV com Rust.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::{collections::HashMap, fs::File, io::{self, BufRead, BufReader}}; | |
use std::time::{SystemTime, UNIX_EPOCH}; | |
use core::time::Duration; | |
/// Returns the time elapsed since the Unix epoch as a [`Duration`].
///
/// Despite the `_ms` suffix, the result is a full-resolution `Duration`,
/// not a millisecond count; callers convert it to the unit they need.
///
/// # Panics
///
/// Panics if the system clock reports a time earlier than the Unix epoch.
pub fn get_unix_timestamp_ms() -> Duration {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("Time went problem.")
}
pub fn microtime() -> u64 { | |
let start_time = get_unix_timestamp_ms(); | |
start_time.as_secs() * 1000 + start_time.subsec_nanos() as u64 / 1_000_000 | |
} | |
/// Adds `amount` to the total stored under `key`, creating the entry on first sight.
///
/// The map is probed with the borrowed `&str` first, so an owned `String` is
/// allocated only for keys that are not present yet.
fn add_to_tally(tally: &mut HashMap<String, u32>, key: &str, amount: u32) {
    match tally.get_mut(key) {
        Some(total) => *total += amount,
        None => {
            tally.insert(key.to_owned(), amount);
        }
    }
}

/// Reads `./a.csv`, aggregates its rows into one map and prints the map, the
/// file path and the elapsed time.
///
/// Every line is split on `,` and interpreted as:
///
/// * column 0 — client: always registered in the map (with 0 if no price follows);
/// * column 1 — food: its counter is incremented by one;
/// * column 2 — price: parsed as `u32` (unparsable values count as 0) and
///   added to the client's total.
///
/// Clients and foods share the same map, so a client and a food with the same
/// name end up in one entry. Rows with fewer than three columns are tolerated
/// and any additional columns are ignored.
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be opened (annotated
/// with the path) or if a line cannot be read.
fn main() -> io::Result<()> {
    // A monotonic clock is used for the measurement so that wall-clock
    // adjustments cannot make the elapsed time go negative.
    let started = std::time::Instant::now();
    let filepath = "./a.csv";
    let file = File::open(filepath).map_err(|err| {
        io::Error::new(err.kind(), format!("cannot open {}: {}", filepath, err))
    })?;
    let reader = BufReader::new(file);
    let mut consumer_reviews: HashMap<String, u32> = HashMap::new();

    for line in reader.lines() {
        let line = line?;
        let mut columns = line.split(',');

        // `split` yields at least one item, even for an empty line.
        let client = columns.next().unwrap_or("");
        let food = columns.next();
        let price = columns
            .next()
            .and_then(|raw| raw.parse::<u32>().ok())
            .unwrap_or(0);

        if let Some(food) = food {
            add_to_tally(&mut consumer_reviews, food, 1);
        }
        // Adding 0 still registers the client when the price is absent.
        add_to_tally(&mut consumer_reviews, client, price);
    }

    println!("Result: {:?}", consumer_reviews);
    println!("File path: {:?}", filepath);
    // Sample the clock once so both figures describe the same instant.
    let elapsed = started.elapsed();
    println!(
        "End in: {} ms or {} seconds.",
        elapsed.as_millis(),
        elapsed.as_secs()
    );
    Ok(())
}
@lpj145 "brinquei" um pouco com o gist e cheguei nessa versão aqui. Percebi um bom ganho de ms. Eu queria fazer uma versão que não precisasse ficar clonando String, mas aí a HashMap precisaria guardar &str e pra fazer isso acho que a gente precisaria manter o input todo na memória de uma vez só, o que deve ficar inviável para inputs maiores.
use core::time::Duration;
use std::time::{SystemTime, UNIX_EPOCH};
use std::{
collections::HashMap,
fs::File,
io::{self, BufRead, BufReader},
};
/// Measures how far the current system time lies past the Unix epoch.
///
/// The value is returned as a [`Duration`]; no conversion to milliseconds
/// happens here even though the name ends in `_ms`.
///
/// # Panics
///
/// Panics when the system clock is set to a moment before the Unix epoch.
pub fn get_unix_timestamp_ms() -> Duration {
    let now = SystemTime::now();
    let since_epoch = now.duration_since(UNIX_EPOCH);
    since_epoch.expect("Time went problem.")
}
/// Gives the current Unix timestamp expressed in whole milliseconds.
///
/// The whole seconds are scaled to milliseconds and the sub-second
/// nanoseconds are truncated down to milliseconds before being added.
pub fn microtime() -> u64 {
    let now = get_unix_timestamp_ms();
    let whole_seconds_ms = now.as_secs() * 1000;
    let fraction_ms = now.subsec_nanos() as u64 / 1_000_000;
    whole_seconds_ms + fraction_ms
}
/// Builds the error reported when a CSV row lacks a required column.
///
/// `line_no` is the 1-based line number in the input file.
fn missing_column(line_no: usize, column: &str) -> io::Error {
    io::Error::new(
        io::ErrorKind::InvalidData,
        format!("line {}: {} missing", line_no, column),
    )
}

/// Reads `./a.csv`, aggregates its rows into one map and prints the map, the
/// file path and the elapsed time.
///
/// Each line must contain at least three comma-separated columns:
/// client, food and price. The food's counter is incremented by one and the
/// price (parsed as `u32`, unparsable values count as 0) is added to the
/// client's total. Clients and foods share the same map.
///
/// # Errors
///
/// Returns an I/O error if the file cannot be opened or read, and an
/// `InvalidData` error naming the line when a row has fewer than three
/// columns (e.g. a blank line), instead of panicking.
fn main() -> io::Result<()> {
    // A monotonic clock is used for the measurement so that wall-clock
    // adjustments cannot distort the elapsed time.
    let started = std::time::Instant::now();
    let filepath = "./a.csv";
    let file = File::open(filepath).map_err(|err| {
        io::Error::new(err.kind(), format!("cannot open {}: {}", filepath, err))
    })?;
    let reader = BufReader::new(file);
    let mut consumer_reviews: HashMap<String, u32> = HashMap::new();

    for (index, line) in reader.lines().enumerate() {
        let line = line?;
        let line_no = index + 1;
        let mut columns = line.split(',');

        let current_client = columns
            .next()
            .ok_or_else(|| missing_column(line_no, "client"))?;
        let food = columns
            .next()
            .ok_or_else(|| missing_column(line_no, "food"))?;
        let price: u32 = columns
            .next()
            .ok_or_else(|| missing_column(line_no, "price"))?
            .parse()
            .unwrap_or(0);

        *consumer_reviews.entry(food.to_string()).or_insert(0) += 1;
        *consumer_reviews
            .entry(current_client.to_string())
            .or_insert(0) += price;
    }

    println!("Result: {:?}", consumer_reviews);
    println!("File path: {:?}", filepath);
    // Sample the clock once so both figures describe the same instant.
    let elapsed = started.elapsed();
    println!(
        "End in: {} ms or {} seconds.",
        elapsed.as_millis(),
        elapsed.as_secs()
    );
    Ok(())
}
Versão que clona os &str somente quando precisa inserir uma nova entrada na HashMap:
use core::time::Duration;
use std::time::{SystemTime, UNIX_EPOCH};
use std::{
collections::HashMap,
fs::File,
io::{self, BufRead, BufReader},
};
/// Reports the span between the Unix epoch and the current system time.
///
/// The span is handed back as a [`Duration`]; the `_ms` in the name does not
/// imply any conversion to milliseconds.
///
/// # Panics
///
/// Panics if the system clock currently reads earlier than the Unix epoch.
pub fn get_unix_timestamp_ms() -> Duration {
    // `elapsed` on the epoch is the same measurement as "now since epoch".
    UNIX_EPOCH.elapsed().expect("Time went problem.")
}
/// Current Unix time as a count of whole milliseconds.
///
/// Combines the whole seconds (scaled by 1000) with the sub-second
/// nanoseconds truncated to milliseconds.
pub fn microtime() -> u64 {
    const MILLIS_PER_SEC: u64 = 1000;
    const NANOS_PER_MILLI: u64 = 1_000_000;

    let since_epoch = get_unix_timestamp_ms();
    let sub_second_ms = since_epoch.subsec_nanos() as u64 / NANOS_PER_MILLI;
    since_epoch.as_secs() * MILLIS_PER_SEC + sub_second_ms
}
/// Adds `amount` to the total stored under `key`.
///
/// The map is probed with the borrowed `&str`, so the key is copied into an
/// owned `String` only when it has to be inserted. A new entry starts at
/// `amount` (not at zero), so the first occurrence of a key is counted too.
fn accumulate(totals: &mut HashMap<String, u32>, key: &str, amount: u32) {
    if let Some(total) = totals.get_mut(key) {
        *total += amount;
    } else {
        totals.insert(key.to_string(), amount);
    }
}

/// Builds the error reported when a CSV row lacks a required column.
///
/// `line_no` is the 1-based line number in the input file.
fn invalid_row(line_no: usize, column: &str) -> io::Error {
    io::Error::new(
        io::ErrorKind::InvalidData,
        format!("line {}: {} missing", line_no, column),
    )
}

/// Reads `./a.csv`, aggregates its rows into one map and prints the map, the
/// file path and the elapsed time.
///
/// Each line must contain at least three comma-separated columns:
/// client, food and price. The food's counter is incremented by one and the
/// price (parsed as `u32`, unparsable values count as 0) is added to the
/// client's total. Clients and foods share the same map.
///
/// # Errors
///
/// Returns an I/O error if the file cannot be opened or read, and an
/// `InvalidData` error naming the line when a row has fewer than three
/// columns, instead of panicking.
fn main() -> io::Result<()> {
    // A monotonic clock is used for the measurement so that wall-clock
    // adjustments cannot distort the elapsed time.
    let started = std::time::Instant::now();
    let filepath = "./a.csv";
    let file = File::open(filepath).map_err(|err| {
        io::Error::new(err.kind(), format!("cannot open {}: {}", filepath, err))
    })?;
    let reader = BufReader::new(file);
    let mut consumer_reviews: HashMap<String, u32> = HashMap::new();

    for (index, line) in reader.lines().enumerate() {
        let line = line?;
        let line_no = index + 1;
        let mut columns = line.split(',');

        let current_client = columns
            .next()
            .ok_or_else(|| invalid_row(line_no, "client"))?;
        let food = columns
            .next()
            .ok_or_else(|| invalid_row(line_no, "food"))?;
        let price: u32 = columns
            .next()
            .ok_or_else(|| invalid_row(line_no, "price"))?
            .parse()
            .unwrap_or(0);

        // The first sighting of a key must count as well: a new food starts
        // at 1 and a new client starts at this row's price.
        accumulate(&mut consumer_reviews, food, 1);
        accumulate(&mut consumer_reviews, current_client, price);
    }

    println!("Result: {:?}", consumer_reviews);
    println!("File path: {:?}", filepath);
    // Sample the clock once so both figures describe the same instant.
    let elapsed = started.elapsed();
    println!(
        "End in: {} ms or {} seconds.",
        elapsed.as_millis(),
        elapsed.as_secs()
    );
    Ok(())
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
este é o exemplo 🆙