Custom deep reinforcement learning training process, the code does not report errors, but during the training process it can not be read, just like the data

3 views (last 30 days)
rng(0)
env = rlPredefinedEnv('CartPoleSimscapeModel-Continuous');
%Extract the observation and action specifications from the environment.
obsInfo = getObservationInfo(env);
actInfo = getActionInfo(env);
%Obtain the number of observations (numObs) and actions (numAct).
numObs = obsInfo.Dimension(1);
numAct = actInfo.Dimension(1);
%Set a sample time for the environment
Ts = 0.01;
Tf=20;
%%
%network
criticLayerSizes = [128 200];
actorLayerSizes = [128 200];
%createNetworkWeights;
statePath = [
featureInputLayer(numObs,'Normalization','none','Name','observation')
fullyConnectedLayer(128,'Name','CriticStateFC1')
reluLayer('Name','CriticRelu1')
fullyConnectedLayer(200,'Name','CriticStateFC2')];
actionPath = [
featureInputLayer(1,'Normalization','none','Name','action')
fullyConnectedLayer(200,'Name','CriticActionFC1','BiasLearnRateFactor',0)];
commonPath = [
additionLayer(2,'Name','add')
reluLayer('Name','CriticCommonRelu')
fullyConnectedLayer(1,'Name','CriticOutput')];
criticNetwork = layerGraph(statePath);
criticNetwork = addLayers(criticNetwork,actionPath);
criticNetwork = addLayers(criticNetwork,commonPath);
criticNetwork = connectLayers(criticNetwork,'CriticStateFC2','add/in1');
criticNetwork = connectLayers(criticNetwork,'CriticActionFC1','add/in2');
criticNetwork = dlnetwork(criticNetwork);
% Create the critic function approximator.
criticOptions = rlOptimizerOptions('LearnRate',1e-03,'GradientThreshold',1);
critic = rlQValueFunction(criticNetwork,obsInfo,actInfo);
criticOptimizer = rlOptimizer(criticOptions);
%ActorNetwork
actorNetwork = [
featureInputLayer(numObs,'Normalization','none','Name','observation')
fullyConnectedLayer(128,'Name','ActorFC1')
reluLayer('Name','ActorRelu1')
fullyConnectedLayer(200,'Name','ActorFC2')
reluLayer('Name','ActorRelu2')
fullyConnectedLayer(1,'Name','ActorFC3')
tanhLayer('Name','ActorTanh1')
scalingLayer('Name','ActorScaling','Scale',max(actInfo.UpperLimit))];
actorNetwork = dlnetwork(actorNetwork);
% Create the actor function approximator.
actorOptions = rlOptimizerOptions('LearnRate',5e-04,'GradientThreshold',1);
actor = rlContinuousDeterministicActor(actorNetwork,obsInfo,actInfo);
actorOptimizer = rlOptimizer(actorOptions);
policy = rlDeterministicActorPolicy(actor);
agentOptions = rlDDPGAgentOptions(...
'SampleTime',Ts,...
'ActorOptimizerOptions',actorOptions,...
'CriticOptimizerOptions',criticOptions,...
'ExperienceBufferLength',1e6,...
'MiniBatchSize',128);
agentOptions.NoiseOptions.Variance = 0.4;
agentOptions.NoiseOptions.VarianceDecayRate = 1e-5;
agent = rlDDPGAgent(actor,critic,agentOptions);
%%
%creat buffer
myBuffer.bufferSize = 500;
myBuffer.bufferIndex = 0;
myBuffer.currentBufferLength = 0;
myBuffer.observation = zeros(numObs,1,myBuffer.bufferSize);
myBuffer. nextObservation =zeros(numObs,1,myBuffer.bufferSize);
myBuffer.action = zeros(numAct,1,myBuffer.bufferSize);
myBuffer.reward = zeros(1,myBuffer.bufferSize);
myBuffer.isDone = zeros(1,myBuffer.bufferSize);
%processExpData structure
processExpData.Critic = critic;
processExpData.TargetCritic = critic;
processExpData.Actor = actor;
processExpData.TargetActor = actor;
processExpData.MyBuffer = myBuffer;
processExpData.CriticOptimizer = criticOptimizer;
processExpData.ActorOptimizer = actorOptimizer;
processExpData.MiniBatchSize = 128;
processExpData.DiscountFactor = 0.99;
processExpData.TargetSmoothFactor = 1e-3;
maxEpisodes = 1000;
maxSteps = ceil(Tf/Ts);
trainingTerminationValue = 480;
[trainingPlot,lineReward,lineAveReward] = hBuildFigure;%图像显示
% Enable the training visualization plot.
set(trainingPlot,'Visible','on');
%%
%train
doTraining = true;
if doTraining
% Training loop
for i = 1:maxEpisodes
% update actor, critic
agent = setActor(agent,actor);
agent = setCritic(agent,critic);
out=sim(agent, env);
myBuffer.observations=out.Observation.observations.Data(:,:,1:myBuffer.bufferSize-1);
myBuffer. nextObservation=out.Observation.observations.Data(:,:,2:myBuffer.bufferSize);
myBuffer.action =out.Action.force.Data;
myBuffer.reward = out.Reward.Data'; %转置
myBuffer.isDone = out.IsDone.Data';
%miniBatch
BatchSize.observations=myBuffer.observations(:,:,1:processExpData.MiniBatchSize);
BatchSize.nextObservation=myBuffer.nextObservation(:,:,1:processExpData.MiniBatchSize);
BatchSize.action=myBuffer.action(:,:,1:processExpData.MiniBatchSize);
BatchSize.reward = myBuffer.reward(:,1:processExpData.MiniBatchSize);
BatchSize.isDone = myBuffer.isDone(:,1:processExpData.MiniBatchSize);
BatchSize.nextObs{1}=BatchSize.nextObservation;
BatchSize.obs{1}=BatchSize.observations;
for epoch=1:maxSteps
if ~isempty(BatchSize)
% Update network parameters using the mini-batch.
[processExpData,actorParams] = learnFcn(processExpData,BatchSize);
% Update the policy parameters using the actor parameters.
policy = setLearnableParameters(policy,actorParams);
end
end
% Extract the critic and actor networks from processExpData.
critic = processExpData.Critic;
actor = processExpData.Actor;
% Extract the cumulative reward and calculate average reward
% per step for this episode.
episodeCumulativeReward = sum(BatchSize.reward);
episodeCumulativeRewardVector = cat(2,...
episodeCumulativeRewardVector,episodeCumulativeReward);
movingAveReward = movmean(episodeCumulativeRewardVector,...
aveWindowSize,2);
addpoints(lineReward,episodeCt,episodeCumulativeReward);
addpoints(lineAveReward,episodeCt,movingAveReward(end));
drawnow;
if max(movingAveReward) > trainingTerminationValue
break
end
end
end
% %plot env
% obs = reset(env);
% plot(env);
% for maxStepsPerEpisode = 1:maxStepsPerEpisode
%
% % Select action according to trained policy
% action = getAction(Actor,{obs});
%
% % Step the environment
% [nextObs,reward,isdone] = step(env,action{1});
%
% % Check for terminal condition
% if isdone
% break
% end
%
% obs = nextObs;
%
% end
%%
function [processExpData,actorParams] = learnFcn(processExpData,BatchSize)
% Find the terminal experiences.
doneidx = (BatchSize.isDone == 1);
% Compute target next actions against the next observations.
nextAction = evaluate(processExpData.TargetActor,BatchSize.nextObs);%数据类型要变为cell
% compute qtarget = reward + gamma*Q(nextObservation,nextAction)
% = reward + gamma*expectedFutureReturn
targetq = BatchSize.reward;
% Bootstrap the target at nonterminal experiences.
expectedFutureReturn = ...
getValue(processExpData.TargetCritic,BatchSize.nextObs,nextAction);
targetq(~doneidx) = targetq(~doneidx) + ...
processExpData.DiscountFactor.*expectedFutureReturn(~doneidx);
% Compute critic gradient using deepCriticLoss function.
criticGradient = gradient(processExpData.Critic,@deepCriticLoss,...
[BatchSize.obs,BatchSize.action],targetq);
% Update the critic parameters.
[processExpData.Critic,processExpData.CriticOptimizer] = update(...
processExpData.CriticOptimizer,processExpData.Critic,...
criticGradient);
% Compute the actor gradient using the deepActorGradient function. To
% accelerate the deepActorGradient function, the critic network is
% extracted outside the function and is passed in as a field to the
% actorGradData input struct.
actorGradData.CriticNet = getModel(processExpData.Critic);
actorGradData.MiniBatchSize = processExpData.MiniBatchSize;
actorGradient = customGradient(processExpData.Actor,@deepActorGradient,...
BatchSize.obs,actorGradData);
% Update the actor parameters.
[processExpData.Actor,processExpData.ActorOptimizer] = update(...
processExpData.ActorOptimizer,processExpData.Actor,...
actorGradient);
actorParams = getLearnableParameters(processExpData.Actor);
% Update targets using the given TargetSmoothFactor hyperparameter.
processExpData.TargetCritic = syncParameters(processExpData.TargetCritic,...
processExpData.Critic,processExpData.TargetSmoothFactor);
processExpData.TargetActor = syncParameters(processExpData.TargetActor ,...
processExpData.Actor ,processExpData.TargetSmoothFactor);
end
function loss = deepCriticLoss(q,targetq)
q = q{1};
% Loss is the half mean-square error of q = Q(observation,action)
%against qtarget
loss = mse(q,reshape(targetq,size(q)));
end
function dQdTheta = deepActorGradient(actorNet,observation,gradData)
% Evaluate actions from current observations.
action = forward(actorNet,observation{:});
% Compute: q = Q(s,a)
q = predict(gradData.CriticNet,observation{:},action);
% Compute: qsum = -sum(q)/N to maximize q
qsum = -sum(q,"all")/gradData.MiniBatchSize;
% Compute: d(-sum(q)/N)/dActorParams
dQdTheta = dlgradient(qsum,actorNet.Learnables);
end
function [trainingPlot, lineReward, lineAveReward] = hBuildFigure()
plotRatio = 16/9;
trainingPlot = figure(...
'Visible','off',...
'HandleVisibility','off', ...
'NumberTitle','off',...
'Name','Cart Pole Custom Training');
trainingPlot.Position(3) = plotRatio * trainingPlot.Position(4);
ax = gca(trainingPlot);
lineReward = animatedline(ax);
lineAveReward = animatedline(ax,'Color','r','LineWidth',3);
xlabel(ax,'Episode');
ylabel(ax,'Reward');
legend(ax,'Cumulative Reward','Average Reward','Location','northwest')
title(ax,'Training Progress');
end
Operation terminated by user during deep.internal.recording.convert.tapeToFunction
In deep.AcceleratedFunction>iGenerateBackwardFunctionNoCleanup (line 637)
[backwardFun, backwardFileName] = deep.internal.recording.convert.tapeToFunction(tape, backwardInputIDs, gradIDs);
In deep.AcceleratedFunction>iGenerateBackwardFunction (line 603)
[backwardFun, backwardFileName] = iGenerateBackwardFunctionNoCleanup(args,numIntermediateAdjointsToDrop);
In deep.AcceleratedFunction/augmentWithBackwardFunctions (line 467)
[fullBackwardFun, fullBackwardFileName] = iGenerateBackwardFunction(args, 0);
In deep.AcceleratedFunction/generateForward (line 442)
fun = augmentWithBackwardFunctions(obj, args, numIntermediates, generatedCode);
In () (line 262)
[cacheData, varargout, illegalOutputs] = generateForward(obj, varargout, inputNodes, tm, priorTapeCount, isTracing);
In nnet.internal.cnn.layer.CodegenFusedLayer/evaluate (line 153)
[Z{1:nout}] = trainingFun(X, this.Learnables, this.State);
In nnet.internal.cnn.layer.CodegenFusedLayer/predict (line 75)
[varargout{1:nargout}] = evaluate(this, X, @predictPropagate, this.PredictTrainingFcn, this.PredictInferenceCache);
In nnet.internal.cnn.layer.GraphExecutor>iPredictWithoutState (line 407)
out = predict(layer, in);

Answers (0)

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!