clear all
rng(0)
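% Open the Simulink model of the simple pendulum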
mdl = 'rlSimplePendulumModel';
open_system(mdl)
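% Create a predefined environment with a continuous action space and report
% the pendulum angle as sin(theta) and cos(theta)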
env = rlPredefinedEnv('SimplePendulumModel-Continuous');
set_param('rlSimplePendulumModel/create observations','ThetaObservationHandling','sincos');
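% Get the observation and action specifications from the environment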
obsInfo = getObservationInfo(env);
numObs = obsInfo.Dimension(1);
actInfo = getActionInfo(env);
numAct = actInfo.Dimension(1);
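% Reset function: start each episode with the pendulum hanging down (theta0 = pi)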
env.ResetFcn = @(in)setVariable(in,'theta0',pi,'Workspace',mdl);
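% Agent sample time and simulation duration, in seconds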
Ts = 0.05;
Tf = 20;
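% Hidden layer sizes for the critic and actor networks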
criticLayerSizes = [200 100];
actorLayerSizes = [200 100];
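% Critic network: maps the observations to a scalar state-value estimate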
criticNetwork = [imageInputLayer([numObs 1 1],'Normalization','none','Name','observations')
    fullyConnectedLayer(criticLayerSizes(1),'Name','CriticFC1')
    reluLayer('Name','CriticRelu1')
    fullyConnectedLayer(criticLayerSizes(2),'Name','CriticFC2')
    reluLayer('Name','CriticRelu2')
    fullyConnectedLayer(1,'Name','CriticOutput')
    ];
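% Create the value-function critic representation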
criticOpts = rlRepresentationOptions('LearnRate',1e-3);
critic = rlValueRepresentation(criticNetwork,obsInfo, ...
    'Observation',{'observations'},criticOpts);
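% Actor network: a shared input path feeds separate mean and standard-deviation
% paths that parameterize a Gaussian action distribution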
inPath = [imageInputLayer([numObs 1 1],'Normalization','none','Name','observations')
    fullyConnectedLayer(numAct,'Name','infc')];                % common input path
meanPath = [tanhLayer('Name','tanh')
    scalingLayer('Name','scale','Scale',actInfo.UpperLimit)]; % scale the mean to the torque limits
sdevPath = softplusLayer('Name','splus');                      % keep the standard deviation positive
outLayer = concatenationLayer(3,2,'Name','mean&sdev');         % concatenate mean and standard deviation
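% Assemble the actor layer graph and connect the mean and standard-deviation paths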
net = layerGraph(inPath);
net = addLayers(net,meanPath);
net = addLayers(net,sdevPath);
net = addLayers(net,outLayer);
net = connectLayers(net,'infc','tanh/in');
net = connectLayers(net,'infc','splus/in');
net = connectLayers(net,'scale','mean&sdev/in1');
net = connectLayers(net,'splus','mean&sdev/in2');
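% Create the Gaussian stochastic actor representation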
actorOptions = rlRepresentationOptions('LearnRate',1e-3);
actor = rlStochasticActorRepresentation(net,obsInfo,actInfo, ...
    'Observation',{'observations'},actorOptions);
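% PPO agent options: clipped surrogate objective with GAE advantage estimation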
opt = rlPPOAgentOptions('ExperienceHorizon',512,...
    'ClipFactor',0.2,...
    'EntropyLossWeight',0.02,...
    'MiniBatchSize',64,...
    'NumEpoch',3,...
    'AdvantageEstimateMethod','gae',...
    'GAEFactor',0.95,...
    'SampleTime',Ts,...
    'DiscountFactor',0.9995);
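% Create the PPO agent from the actor and critic representations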
agent = rlPPOAgent(actor,critic,opt);
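% Training options: run at most 5000 episodes and stop when the average reward
% over 5 consecutive episodes reaches -740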
maxepisodes = 5000;
maxsteps = ceil(Tf/Ts);
trainOpts = rlTrainingOptions(...
    'MaxEpisodes',maxepisodes,...
    'MaxStepsPerEpisode',maxsteps,...
    'ScoreAveragingWindowLength',5,...
    'Verbose',false,...
    'Plots','training-progress',...
    'StopTrainingCriteria','AverageReward',...
    'StopTrainingValue',-740,...
    'SaveAgentCriteria','EpisodeReward',...
    'SaveAgentValue',-740);
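% Train the agent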
trainingStats = train(agent,env,trainOpts);
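% Simulate the trained agent for at most 500 steps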
simOptions = rlSimulationOptions('MaxSteps',500);
experience = sim(env,agent,simOptions);