Commit dd27f452 authored by Valentin Antuori

improve Reinforce time computation

parent 34f7dd28
......@@ -24,6 +24,19 @@ double temperature = 1.0;
bool debug_flag = false;
bool trace_flag = false;
double to_tensor_time_global = 0.0;
double forward_time_global = 0.0;
double _forward_forward = 0.0;
double _forward_softmax = 0.0;
double _forward_rest = 0.0;
double rest_time_global = 0.0;
double to_vector_for1 = 0.0;
double to_vector_for2 = 0.0;
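The new globals above accumulate wall-clock seconds for the individual phases of an episode (state-to-tensor conversion, its two halves, the policy forward pass and its sub-steps, and everything else). They are all filled with the same steady_clock tick/accumulate pattern; a minimal sketch of that pattern, with an illustrative accumulator and function name that are not part of the patch:

#include <chrono>

double phase_time = 0.0;  // hypothetical accumulator, same role as the globals above

void timed_phase() {
    auto tick = std::chrono::steady_clock::now();
    // ... the work being profiled goes here ...
    phase_time += std::chrono::duration<double>(
        std::chrono::steady_clock::now() - tick).count();
}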
void set_temperature(double t)
{
temperature = t;
......@@ -128,6 +141,7 @@ class State_rl
Tensor to_tensor()
{
auto tick = std::chrono::steady_clock::now();
mapping_act2task.clear();
for(int comp = 0; comp < data.nb_components; ++comp){
......@@ -151,11 +165,15 @@ class State_rl
// int counter = 0;
int size = mapping_act2task.size();
torch::Tensor t = torch::empty({size, 4}, torch::kFloat);
//torch::Tensor t = torch::empty({size, 4}, torch::kFloat);
// std::cout << "candidat : " << std::endl;
if(trace_flag){
std::cout << "candidate : " << std::endl;
}
std::vector<float> v(size*4);
int acc = 0;
for(uint i = 0; i < mapping_act2task.size(); ++i){
int task = mapping_act2task[i];
int comp = data.component(task);
......@@ -167,18 +185,27 @@ class State_rl
int slack = lst[comp] - std::max(data.release_date(task), (time - tardiness) + dist);
int pick_1 = data.is_pickup(task) ? 1 : 0;
t[i][0] = (float)slack / data.max_slack;
t[i][1] = (float)std::max(dist, data.release_date(task) - (time - tardiness)) / data.max_dist;
t[i][2] = 1 - (float)data.trolley_length(task) / data.T_max;
t[i][3] = (float)pick_1;
v[acc++] = (float)slack / data.max_slack;
v[acc++] = (float)std::max(dist, data.release_date(task) - (time - tardiness)) / data.max_dist;
v[acc++] = 1 - (float)data.trolley_length(task) / data.T_max;
v[acc++] = (float)pick_1;
if(trace_flag){
std::cout << mapping_act2task[i] << "(" << (float)slack / data.max_slack << " " << (float)std::max(dist, data.release_date(task) - (time - tardiness)) / data.max_dist
<< " " << 1-(float)data.trolley_length(task) / data.T_max << " " << pick_1 << ")" << std::endl;
}
}
return t;
to_vector_for1 += ((std::chrono::duration<double>)(std::chrono::steady_clock::now()-tick)).count();
tick = std::chrono::steady_clock::now();
auto options = torch::TensorOptions().dtype(at::kFloat);
auto tensor = torch::from_blob(v.data(), {1, size*4}, options).clone();
tensor = tensor.reshape({size, 4});
to_vector_for2 += ((std::chrono::duration<double>)(std::chrono::steady_clock::now()-tick)).count();
return tensor;
}
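The rewritten to_tensor() fills a plain std::vector<float> and converts it to a tensor once, instead of assigning through t[i][j], which dispatches per element. A self-contained sketch of that conversion (function and variable names are illustrative); building the blob directly with shape {rows, 4} is equivalent to the patch's {1, size*4} blob followed by reshape, since the data is contiguous and row-major:

#include <torch/torch.h>
#include <vector>

// Build a {rows, 4} float tensor from contiguous row-major feature data.
torch::Tensor rows_to_tensor(std::vector<float>& v, int64_t rows) {
    auto options = torch::TensorOptions().dtype(torch::kFloat);
    // from_blob does not take ownership, so clone() before the vector goes away.
    return torch::from_blob(v.data(), {rows, 4}, options).clone();
}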
......@@ -191,38 +218,39 @@ class State_rl
vector<int> mapping_act2task;
};
struct Net : torch::nn::Module {
Net(int64_t N, int64_t M)
: linear(register_module("linear", torch::nn::Linear(N, M))) {
}
torch::Tensor forward(torch::Tensor input) {
return linear(input);
}
torch::nn::Linear linear;
};
// Define a new Module.
// struct Net : torch::nn::Module {
// Net() {
// // Construct and register two Linear submodules.
// fc1 = register_module("lin1", nn::Linear(nn::LinearOptions(4, 1).bias(false)));
// }
// // Implement the Net's algorithm.
// torch::Tensor forward(torch::Tensor x) {
// // Use one of many tensor manipulation functions.
// x = fc1->forward(x);
// x = torch::softmax((1-tmp)/temperature, /*dim=*/0);
// return x;
// }
// // Use one of many "standard library" modules.
// torch::nn::Linear fc1{nullptr};
// };
// Given an environment, construct a policy.
nn::Sequential construct_policy(const torch::Device &device)
{
vector<float> theta({1.0034868901129597, 2.3039372263365396, 0.5926475075479015, 0.095393968026037});
//vector<float> theta({1.0034868901129597, 2.3039372263365396, 0.5926475075479015, 0.095393968026037});
// const auto state_space = data.nb_components*4;
// const auto hidden_units = data.nb_components;
// const auto action_space = data.nb_components;
auto policy = nn::Sequential(//nn::Linear(nn::LinearOptions(4, 4)),
//nn::Functional(torch::relu),
nn::Linear(nn::LinearOptions(4, 1).bias(false)));
// nn::Functional(torch::relu),
// nn::Linear(hidden_units, hidden_units),
// nn::Functional(torch::relu),
// nn::Linear(hidden_units, hidden_units),
// nn::Functional(torch::relu),
// nn::Linear(hidden_units, hidden_units),
// nn::Functional(torch::relu),
// nn::Linear(hidden_units, action_space));
auto policy = nn::Sequential(
nn::Linear(nn::LinearOptions(4, 1).bias(false))
);
policy->to(device);
return policy;
}
......@@ -232,15 +260,15 @@ int choice(const Tensor &probs)
auto roll = torch::rand(1).item<float>();
auto save_roll = roll;
// std::cout << roll << std::endl;
// std::cout << probs << std::endl;
size_t size = probs.numel();
auto p = probs.data_ptr<float>();
for (int i = 0; i < probs.size(0); ++i) {
const auto prob = probs[i].item<float>();
if (roll <= prob)
for(size_t i = 0; i < size; ++i)
{
if (roll <= p[i])
return i;
else
roll -= prob;
roll -= p[i];
}
std::cerr << save_roll << " --> " << roll << std::endl;
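choice() draws one uniform number and walks the cumulative probability mass; the rewrite reads the values through data_ptr<float>() instead of calling .item<float>() on every element, which avoids a tensor dispatch per candidate. For example, with probabilities {0.2, 0.5, 0.3} and roll = 0.6: 0.6 > 0.2 so roll becomes 0.4, and 0.4 <= 0.5 returns index 1. A one-call alternative for comparison (not what the patch uses; it allocates a result tensor per draw):

#include <torch/torch.h>

int64_t sample_once(const torch::Tensor& probs) {
    // probs: non-negative 1-D float tensor; multinomial normalizes internally.
    return torch::multinomial(probs, /*num_samples=*/1).item<int64_t>();
}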
......@@ -249,20 +277,19 @@ int choice(const Tensor &probs)
/*
Select an action using the policy given the current state.
Return the action and the log probability of taking that action.
*/
bool display_probs = false;
std::tuple<int, Tensor> select_action(nn::Sequential &policy, const Tensor &state, const State_rl& s)
{
// std::cout << "State_rl : " << std::endl << state << std::endl;
auto tick = std::chrono::steady_clock::now();
const auto tmp = policy->forward(state);
_forward_forward += ((std::chrono::duration<double>)(std::chrono::steady_clock::now() - tick)).count();
// std::cout << "Params :"<<std::endl << policy->parameters() << std::endl;
// std::cout << "NN out :"<<std::endl << tmp << std::endl;
tick = std::chrono::steady_clock::now();
const auto probs = torch::softmax((1-tmp)/temperature, /*dim=*/0);
// const auto valid_probs = s.remove_invalid_moves(probs);
// std::cout << "Proba 2 : " << std::endl << valid_probs << std::endl;
_forward_softmax += ((std::chrono::duration<double>)(std::chrono::steady_clock::now() - tick)).count();
if(trace_flag){
std::cout << "Proba : ";
......@@ -277,8 +304,7 @@ std::tuple<int, Tensor> select_action(nn::Sequential &policy, const Tensor &stat
std::cout << std::endl;
}
tick = std::chrono::steady_clock::now();
std::tuple<int, Tensor> return_value;
try {
const auto action = choice(probs);
......@@ -289,6 +315,7 @@ std::tuple<int, Tensor> select_action(nn::Sequential &policy, const Tensor &stat
cout << probs << std::endl;
exit(0);
}
_forward_rest += ((std::chrono::duration<double>)(std::chrono::steady_clock::now() - tick)).count();
return return_value;
}
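select_action now times its three phases separately: the linear forward pass (_forward_forward), the temperature softmax (_forward_softmax), and the sampling plus bookkeeping (_forward_rest). The probability computation itself is compact; a standalone sketch of the scoring step as written above:

#include <torch/torch.h>

// Turn one score per candidate into a sampling distribution over dim 0.
// Lower scores receive more probability mass; a small temperature makes the
// distribution greedier, a large one pushes it toward uniform.
torch::Tensor scores_to_probs(const torch::Tensor& scores, double temperature) {
    return torch::softmax((1 - scores) / temperature, /*dim=*/0);
}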
/*
......@@ -302,28 +329,31 @@ std::tuple<Tensor, Tensor, Solution > play_episode(nn::Sequential &policy, const
auto rewards = std::vector<float>{};
auto log_probs = std::vector<Tensor>{};
Solution sol(data);
Solution sol(&data);
State_rl s(data);
// double min_reward = 0;
for(int j = 0; j < data.nb_tasks; ++j){
auto tick = std::chrono::steady_clock::now();
std::chrono::duration<double> to_tensor_time(0);
std::chrono::duration<double> forward_time(0);
std::chrono::duration<double> rest_time(0);
for(int j = 0; j < data.nb_tasks; ++j){
if(trace_flag){
std::cout << std::endl << " ------ Step "<< j << " ------" << std::endl;
}
tick = std::chrono::steady_clock::now();
auto state_tensor = s.to_tensor().to(device);
to_tensor_time += (std::chrono::steady_clock::now() - tick);
auto [action, log_prob] = select_action(policy, s.to_tensor().to(device), s);
// if(s.nexts[action] < 0)
// {
// rewards.assign(rewards.size()+1, 0);
// rewards.back() = -1;
// // log_probs = std::vector<Tensor>{};
// log_probs.push_back(std::move(log_prob));
// break;
// }
tick = std::chrono::steady_clock::now();
auto [action, log_prob] = select_action(policy, state_tensor, s);
forward_time += (std::chrono::steady_clock::now() - tick);
tick = std::chrono::steady_clock::now();
int selected_task = s.action_to_task(action);
sol.append(selected_task);
......@@ -340,10 +370,14 @@ std::tuple<Tensor, Tensor, Solution > play_episode(nn::Sequential &policy, const
// std::cout << "State_rl : " << std::endl;
// std::cout << sol << std::endl;
rest_time += (std::chrono::steady_clock::now() - tick);
}
// std::cout << "Episode time : " << std::endl;
to_tensor_time_global += to_tensor_time.count();
forward_time_global += forward_time.count();
rest_time_global += rest_time.count();
return std::make_tuple(torch::tensor(std::move(rewards)),
torch::stack(std::move(log_probs)), sol);
}
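play_episode now measures the per-step phases into local durations and only adds them to the globals once per episode. Its return value pairs the per-step rewards with the stacked log-probabilities; torch::stack keeps the autograd graph of the individual log-probability tensors (their exact construction sits in the elided part of select_action), so the loss computed later can backpropagate into the policy. A minimal sketch of that stacking step under that assumption:

#include <torch/torch.h>
#include <vector>

// Collapse one log-probability tensor per step into a single tensor
// while preserving gradients, as the return statement above does.
torch::Tensor stack_log_probs(std::vector<torch::Tensor> log_probs) {
    return torch::stack(std::move(log_probs));
}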
......@@ -351,13 +385,23 @@ std::tuple<Tensor, Tensor, Solution > play_episode(nn::Sequential &policy, const
// Discount the rewards by gamma.
Tensor discount(const Tensor &rewards, float gamma)
{
auto discounted = torch::zeros_like(rewards);
auto running_sum = 0.0;
for (int i = rewards.size(0) - 1; i >= 0; --i) {
running_sum = rewards[i].item<float>() + gamma * running_sum;
discounted[i] = running_sum;
int size = rewards.numel();
auto p = rewards.data_ptr<float>();
std::vector<float> v(size);
float running_sum = 0;
for (int i = size-1; i >= 0; --i) {
running_sum = p[i] + gamma * running_sum;
v[i] = running_sum;
}
return discounted;
auto options = torch::TensorOptions().dtype(at::kFloat);
auto tensor = torch::from_blob(v.data(), {1, size}, options).clone();
return tensor;
}
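discount() now does the backward cumulative sum on a raw float buffer and builds the result tensor once; note the result keeps shape {1, size}. For gamma = 0.9 and rewards {1, 1, 1} the discounted returns are {2.71, 1.9, 1.0}; with gamma = 1, as used in the training loop, each entry is simply the sum of the remaining rewards. A quick usage check, assuming discount from above is in scope:

#include <torch/torch.h>
#include <iostream>

int main() {
    auto rewards = torch::tensor({1.0f, 1.0f, 1.0f});
    // Expected output: 2.71  1.9  1.0 in a tensor of shape {1, 3}.
    std::cout << discount(rewards, /*gamma=*/0.9f) << std::endl;
    return 0;
}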
auto normalize(const Tensor &rewards) -> Tensor
......@@ -375,6 +419,8 @@ void improve_policy(nn::Sequential &policy, const vector<Instance>& data, const
std::cout << "batch size = " << real_batch_size << std::endl;
for (int i = 0; i < episodes; ++i) {
std::cout << std::endl <<" ------------------- " << std::endl;
// if(i == 28){
// trace_flag = true;
// }
......@@ -382,6 +428,7 @@ void improve_policy(nn::Sequential &policy, const vector<Instance>& data, const
double lmax_sum = 0;
torch::Tensor loss_batch = torch::zeros({1}, torch::kFloat);
std::chrono::duration<double> loss_computation(0);
for(int j = 0; j < real_batch_size; ++j){
if(trace_flag){
std::cout << " ========== " << std::endl;
......@@ -391,13 +438,14 @@ void improve_policy(nn::Sequential &policy, const vector<Instance>& data, const
for(auto inst : data){
const auto [_rewards, _log_probs, _sol] = play_episode(policy, inst, device);
auto begin = std::chrono::steady_clock::now();
lmax_sum += _sol.lmax();
// log_probs[j] = _log_probs;
// rewards.add(discount(_rewards, /*gamma=*/1));
auto returns = normalize(
auto returns = //normalize(
discount(_rewards, /*gamma=*/1)
)
//)
.to(device);
......@@ -409,24 +457,43 @@ void improve_policy(nn::Sequential &policy, const vector<Instance>& data, const
std::cout << "loss_t = " << loss_t << std::endl;
}
loss_batch.add_(loss_t);
auto end = std::chrono::steady_clock::now();
loss_computation+= (end - begin);
}
}
std::cout << "to_tensor_time_global = " << to_tensor_time_global << std::endl;
std::cout << " - To tensor for : " << to_vector_for1 << std::endl;
std::cout << " - To tensor tensor : " << to_vector_for2 << std::endl;
std::cout << "forward_time_global = " << forward_time_global << std::endl;
std::cout << " - In forward forward : " << _forward_forward << std::endl;
std::cout << " - In forward softmax : " << _forward_softmax << std::endl;
std::cout << " - In forward rest : " << _forward_rest << std::endl;
std::cout << "rest_time_global = " << rest_time_global << std::endl;
std::cout << "loss_computation = " << loss_computation.count() << std::endl;
auto begin = std::chrono::steady_clock::now();
optimizer.zero_grad();
auto loss = loss_batch / (real_batch_size * static_cast<int>(data.size()));
std::cout << "Step " << i << " : " << lmax_sum/(real_batch_size * static_cast<int>(data.size())) << std::endl;
std::cout << "Param : " << policy->parameters() << std::endl << std::endl;
std::cout << "Backward..." << std::endl;
loss.backward();
std::cout << "Optimizer step..." << std::endl;
optimizer.step();
std::cout << " ===> Param : " << policy->parameters() << std::endl;
std::cout << " ===> Param : " << policy->parameters() << std::endl << std::endl;
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> backwards_duration = end - begin;
std::cout << "Back prop duration : " << backwards_duration.count() << std::endl;
if(debug_flag)
{
......
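The exact expression for loss_t sits in the elided hunk above; for reference, the standard REINFORCE surrogate it corresponds to is the negative sum of log-probabilities weighted by the (discounted) returns, roughly as sketched below. This is a generic sketch, not necessarily the file's exact code.

#include <torch/torch.h>

// REINFORCE surrogate loss for one episode: minimizing it performs gradient
// ascent on the expected return, since only the log-probabilities carry grads.
torch::Tensor reinforce_loss(const torch::Tensor& log_probs,
                             const torch::Tensor& returns) {
    return -(log_probs * returns).sum();
}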
......@@ -12,7 +12,7 @@ extern bool trace_flag;
void set_temperature(double t);
torch::nn::Sequential construct_policy(const torch::Device &device);
void improve_policy(torch::nn::Sequential &policy, const vector<Instance>& data, const torch::Device &device, torch::optim::Optimizer &optimizer,
void improve_policy(torch::nn::Sequential &policy, const std::vector<Instance>& data, const torch::Device &device, torch::optim::Optimizer &optimizer,
int episodes = 100, int batch_size = 1);
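For context, the interface declared here is typically driven along these lines; the header name, the Instance loader, and the learning rate are placeholders, not taken from this repository:

#include <torch/torch.h>
#include <vector>
// #include "reinforce.h"   // assumed header providing the declarations above

int main() {
    torch::Device device(torch::cuda::is_available() ? torch::kCUDA : torch::kCPU);
    std::vector<Instance> instances = load_instances();  // hypothetical loader
    auto policy = construct_policy(device);
    torch::optim::Adam optimizer(policy->parameters(), /*lr=*/1e-3);
    set_temperature(1.0);
    improve_policy(policy, instances, device, optimizer,
                   /*episodes=*/100, /*batch_size=*/1);
    return 0;
}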
......