GLSL optimizer for shaders on GLSL-only devices

has anyone seen https://github.com/aras-p/glsl-optimizer ? it’s a library that optimizes GLSL shaders. there’s a good introduction here: http://aras-p.info/blog/2010/09/29/glsl-optimizer/

you can test it out in a browser here: http://zz85.github.io/glsl-optimizer/

because of the way the RA GLSL shaders have the vertex and fragment shader combined into the one file, if you want to convert one you have to run the part between the #if defined(VERTEX) and the #elif defined(FRAGMENT) through the optimizer as a vertex shader, and the part after the #elif defined(FRAGMENT) (up to the #endif) as a fragment shader, and then paste the two results back into the same #if structure.

for example, i tried it out on https://github.com/RetroPie/common-shaders/blob/rpi/shaders/crt-hyllian-sharpness-hack.glsl - i picked one at random that performed poorly on the pi. i guess this is the automatic GLSL conversion of https://github.com/libretro/common-shaders/blob/4eabef3da13032f096222ac8c6b894ed23992da1/crt/shaders/crt-hyllian.cg with the sharpness hack enabled, but it might have some additional tweaks made to get it working on a pi.

the optimised shader i got:

// GLSL shader autogenerated by cg2glsl.py.
#if defined(VERTEX)


// Vertex stage: transforms VertexCoord by MVPMatrix and forwards the
// texture coordinate to the fragment stage through the TEX0 varying.
// _r0008 is a global scratch variable used to accumulate the position.
vec4 _r0008;
attribute vec4 VertexCoord;
attribute vec4 TexCoord;
varying vec4 TEX0;
uniform mat4 MVPMatrix;
void main ()
{
  // Matrix * vector written out one column at a time:
  //   x*M[0] + y*M[1] + z*M[2] + w*M[3]  ==  MVPMatrix * VertexCoord
  _r0008 = (VertexCoord.x * MVPMatrix[0]);
  _r0008 = (_r0008 + (VertexCoord.y * MVPMatrix[1]));
  _r0008 = (_r0008 + (VertexCoord.z * MVPMatrix[2]));
  _r0008 = (_r0008 + (VertexCoord.w * MVPMatrix[3]));
  gl_Position = _r0008;
  // Only .xy is written here; TEX0.zw is never assigned in this stage.
  TEX0.xy = TexCoord.xy;
}


#elif defined(FRAGMENT)


// Fragment stage. For each output pixel it:
//   1. samples four horizontally adjacent texels around the pixel,
//   2. blends them with cubic weights evaluated at three horizontal
//      offsets (t, t + dz, t - dz) and keeps the per-channel maximum,
//   3. attenuates the result by a smoothstep falloff based on the vertical
//      distance from the texel-row centre,
//   4. applies fixed gains and writes an opaque colour.
//
// Default float precision for this stage; most declarations below override
// it with an explicit mediump/lowp qualifier. TEX0 carries no qualifier and
// therefore uses this default.
precision highp float;
uniform sampler2D Texture;
// Global scratch variables emitted by the converter: _c* hold sample
// coordinates, _r0033/_r0045/_r0057 hold the three weight vectors and
// _r0043/_r0055/_r0067 the three filtered colours.
mediump vec2 _c0025;
mediump vec2 _c0029;
mediump vec2 _c0031;
mediump vec4 _r0033;
lowp vec3 _r0043;
mediump vec4 _r0045;
lowp vec3 _r0055;
mediump vec4 _r0057;
lowp vec3 _r0067;
lowp vec3 _TMP76;
varying vec4 TEX0;
uniform mediump vec2 OutputSize;
uniform mediump vec2 TextureSize;
uniform mediump vec2 InputSize;
void main ()
{
  mediump vec3 _d_1;
  mediump float _dz_2;
  lowp vec3 _color_3;
  mediump vec2 _tc_4;
  mediump vec2 _pix_coord_5;
  // tmpvar_6: texture size with the horizontal resolution doubled.
  // NOTE(review): the 2.0 factor is presumably the "sharpness hack"
  // mentioned alongside this shader - confirm against the Cg original.
  mediump vec2 tmpvar_6;
  tmpvar_6.x = (2.0 * TextureSize.x);
  tmpvar_6.y = TextureSize.y;
  // tmpvar_7: one horizontal step on that doubled grid, (1/(2*width), 0).
  mediump vec2 tmpvar_7;
  tmpvar_7.y = 0.0;
  tmpvar_7.x = (1.0/(tmpvar_6.x));
  // Position in grid units, shifted left by half a cell.
  _pix_coord_5 = ((TEX0.xy * tmpvar_6) - vec2(0.5, 0.0));
  // Centre of the grid cell containing that position, back in normalised
  // texture coordinates.
  _tc_4 = ((floor(_pix_coord_5) + vec2(0.5, 0.5)) / tmpvar_6);
  // tmpvar_8: fractional position inside the cell (x drives the horizontal
  // filter, y drives the vertical falloff further down).
  mediump vec2 tmpvar_8;
  tmpvar_8 = fract(_pix_coord_5);
  // Four taps along x: _tc_4 - step, _tc_4, _tc_4 + step, _tc_4 + 2*step.
  _c0025 = (_tc_4 - tmpvar_7);
  lowp vec4 tmpvar_9;
  tmpvar_9 = texture2D (Texture, _c0025);
  lowp vec4 tmpvar_10;
  tmpvar_10 = texture2D (Texture, _tc_4);
  _c0029 = (_tc_4 + tmpvar_7);
  lowp vec4 tmpvar_11;
  tmpvar_11 = texture2D (Texture, _c0029);
  _c0031 = (_tc_4 + (2.0 * tmpvar_7));
  lowp vec4 tmpvar_12;
  tmpvar_12 = texture2D (Texture, _c0031);
  // tmpvar_13 = (t^3, t^2, t, 1) with t = tmpvar_8.x.
  mediump vec4 tmpvar_13;
  tmpvar_13.w = 1.0;
  tmpvar_13.x = ((tmpvar_8.x * tmpvar_8.x) * tmpvar_8.x);
  tmpvar_13.y = (tmpvar_8.x * tmpvar_8.x);
  tmpvar_13.z = tmpvar_8.x;
  // Each dot product evaluates one cubic polynomial in t, giving the weight
  // of one tap. The coefficient rows match the Catmull-Rom basis and the
  // four weights sum to 1.
  _r0033.x = dot (vec4(-0.5, 1.0, -0.5, 0.0), tmpvar_13);
  _r0033.y = dot (vec4(1.5, -2.5, 0.0, 1.0), tmpvar_13);
  _r0033.z = dot (vec4(-1.5, 2.0, 0.5, 0.0), tmpvar_13);
  _r0033.w = dot (vec4(0.5, -0.5, 0.0, 0.0), tmpvar_13);
  // _r0043: weighted sum of the four taps at offset t.
  _r0043 = (_r0033.x * tmpvar_9.xyz);
  _r0043 = (_r0043 + (_r0033.y * tmpvar_10.xyz));
  _r0043 = (_r0043 + (_r0033.z * tmpvar_11.xyz));
  _r0043 = (_r0043 + (_r0033.w * tmpvar_12.xyz));
  // _dz_2: horizontal offset derived from the input/output width ratio.
  _dz_2 = ((0.9 * InputSize.x) / OutputSize.x);
  // tmpvar_14 / tmpvar_15: the fractional position nudged by +dz and -dz
  // in x (y is copied but not used by the weights below).
  mediump vec2 tmpvar_14;
  tmpvar_14.x = (tmpvar_8.x + _dz_2);
  tmpvar_14.y = tmpvar_8.y;
  mediump vec2 tmpvar_15;
  tmpvar_15.x = (tmpvar_8.x - _dz_2);
  tmpvar_15.y = tmpvar_8.y;
  // Same cubic weights and tap blend, evaluated at t + dz -> _r0055.
  mediump vec4 tmpvar_16;
  tmpvar_16.w = 1.0;
  tmpvar_16.x = ((tmpvar_14.x * tmpvar_14.x) * tmpvar_14.x);
  tmpvar_16.y = (tmpvar_14.x * tmpvar_14.x);
  tmpvar_16.z = tmpvar_14.x;
  _r0045.x = dot (vec4(-0.5, 1.0, -0.5, 0.0), tmpvar_16);
  _r0045.y = dot (vec4(1.5, -2.5, 0.0, 1.0), tmpvar_16);
  _r0045.z = dot (vec4(-1.5, 2.0, 0.5, 0.0), tmpvar_16);
  _r0045.w = dot (vec4(0.5, -0.5, 0.0, 0.0), tmpvar_16);
  _r0055 = (_r0045.x * tmpvar_9.xyz);
  _r0055 = (_r0055 + (_r0045.y * tmpvar_10.xyz));
  _r0055 = (_r0055 + (_r0045.z * tmpvar_11.xyz));
  _r0055 = (_r0055 + (_r0045.w * tmpvar_12.xyz));
  // Same again, evaluated at t - dz -> _r0067.
  mediump vec4 tmpvar_17;
  tmpvar_17.w = 1.0;
  tmpvar_17.x = ((tmpvar_15.x * tmpvar_15.x) * tmpvar_15.x);
  tmpvar_17.y = (tmpvar_15.x * tmpvar_15.x);
  tmpvar_17.z = tmpvar_15.x;
  _r0057.x = dot (vec4(-0.5, 1.0, -0.5, 0.0), tmpvar_17);
  _r0057.y = dot (vec4(1.5, -2.5, 0.0, 1.0), tmpvar_17);
  _r0057.z = dot (vec4(-1.5, 2.0, 0.5, 0.0), tmpvar_17);
  _r0057.w = dot (vec4(0.5, -0.5, 0.0, 0.0), tmpvar_17);
  _r0067 = (_r0057.x * tmpvar_9.xyz);
  _r0067 = (_r0067 + (_r0057.y * tmpvar_10.xyz));
  _r0067 = (_r0067 + (_r0057.z * tmpvar_11.xyz));
  _r0067 = (_r0067 + (_r0057.w * tmpvar_12.xyz));
  // Per-channel maximum of the three filtered colours.
  lowp vec3 tmpvar_18;
  tmpvar_18 = max (_r0043, max (_r0055, _r0067));
  // Red and green are copied as-is; blue is assigned further down with a
  // 1.05 gain, before _color_3 is first read.
  _color_3.xy = tmpvar_18.xy;
  // _TMP76 = sqrt(0.1 + 0.3 * colour), per channel.
  // NOTE(review): presumably the scanline beam width, growing with
  // brightness - confirm against the Cg original.
  _TMP76 = sqrt((vec3(0.1, 0.1, 0.1) + (tmpvar_18 * vec3(0.3, 0.3, 0.3))));
  // tmpvar_19 = clamp(1 - clamp(|fy - 0.5| / _TMP76, 0, 1), 0, 1), where
  // fy = tmpvar_8.y: 1 at the vertical centre of the row, falling to 0
  // once the distance reaches _TMP76.
  mediump vec3 tmpvar_19;
  tmpvar_19 = min (max ((1.0 - 
    min (max ((abs(
      (tmpvar_8.y - 0.5)
    ) / _TMP76), 0.0), 1.0)
  ), 0.0), 1.0);
  // Smoothstep polynomial x*x*(3 - 2x) applied to tmpvar_19.
  _d_1 = ((tmpvar_19 * tmpvar_19) * (vec3(3.0, 3.0, 3.0) - (2.0 * tmpvar_19)));
  // Blend 90% of the way from 1.0 towards _d_1, so the factor never drops
  // below 0.1.
  _d_1 = ((0.9 * (_d_1 - 1.0)) + 1.0);
  // Blue channel gets a 1.05 gain.
  _color_3.z = (tmpvar_18.z * 1.05);
  // Apply the vertical falloff, clamp to [0, 1], then scale by 1.2.
  mediump vec3 tmpvar_20;
  tmpvar_20 = min (max ((_color_3 * _d_1), 0.0), 1.0);
  _color_3 = (tmpvar_20 * 1.2);
  // Output is fully opaque.
  lowp vec4 tmpvar_21;
  tmpvar_21.w = 1.0;
  tmpvar_21.xyz = _color_3;
  gl_FragColor = tmpvar_21;
}


#endif

as you can see, it’s pretty unreadable, but you wouldn’t need to look at it :slight_smile:

my very scientific test was to run sonic 1 at 1080p on a pi3 for 1 minute (minus a few seconds for retropie’s launch script):

old:

RetroArch [INFO] :: Average audio buffer saturation: 65.61 %, standard deviation (percentage points): 17.81 %.
RetroArch [INFO] :: Amount of time spent close to underrun: 0.30 %. Close to blocking: 30.72 %.
RetroArch [INFO] :: Threaded video stats: Frames pushed: 1999, Frames dropped: 1300.
RetroArch [INFO] :: Monitor FPS estimation is disabled for threaded video.
RetroArch [INFO] :: Removing temporary content file: /home/pi/RetroPie/roms/megadrive/Sonic the Hedgehog (JUE) [!].bin.
RetroArch [INFO] :: Average audio buffer saturation: 65.61 %, standard deviation (percentage points): 17.81 %.
RetroArch [INFO] :: Amount of time spent close to underrun: 0.30 %. Close to blocking: 30.72 %.
RetroArch [INFO] :: Monitor FPS estimation is disabled for threaded video.

after optimisation:

RetroArch [INFO] :: Average audio buffer saturation: 70.31 %, standard deviation (percentage points): 18.25 %.
RetroArch [INFO] :: Amount of time spent close to underrun: 0.48 %. Close to blocking: 44.44 %.
RetroArch [INFO] :: Threaded video stats: Frames pushed: 3311, Frames dropped: 7.
RetroArch [INFO] :: Monitor FPS estimation is disabled for threaded video.
RetroArch [INFO] :: Removing temporary content file: /home/pi/RetroPie/roms/megadrive/Sonic the Hedgehog (JUE) [!].bin.
RetroArch [INFO] :: Average audio buffer saturation: 70.31 %, standard deviation (percentage points): 18.25 %.
RetroArch [INFO] :: Amount of time spent close to underrun: 0.48 %. Close to blocking: 44.44 %.
RetroArch [INFO] :: Monitor FPS estimation is disabled for threaded video.

quite a difference in the frames pushed! and the image was the same as far as i could tell. i think this could really help these low-power devices, or anything else that can’t run cg.

here’s an example of it implemented in the GlideN64 plugin: https://github.com/loganmc10/GLideN64/commit/9bc63a821d8ba06071f5e81104d44cf243f3d195

i haven’t attempted to implement it in RA, but does it sound like it could be useful?

To be included directly in RA, it’d need to be backported to plain ol’ C but getting it into appropriate cores could be good. As for the shaders, yeah, that looks like a great improvement. Feel free to submit any cleaned-up versions to https://github.com/hizzlekizzle/glsl-shaders and they’ll supersede the normal machine-converted ones.

I think the problem with converting the existing ones is that it will remove all the parameters, or any user-editable constants in the .glsl file. I think it should work if we do the optimisation at runtime, on the shader just before it’s ultimately delivered to the GPU. at that point any parameters have been applied i think, so you’re safe to optimize away. i guess around here: https://github.com/libretro/RetroArch/blob/883d9ff6b5bb09dec935221af379300a813a4d63/gfx/drivers_shader/shader_glsl.c#L288

a C++ to C conversion is beyond me but i’ll see if I can get a proof of concept up and running with the existing library :slight_smile:

oh lawd, I just peeked at the source and I doubt anyone would be willing to port it to C. Oh well, we’ll see how your PoC goes :slight_smile:

i think even integrating it as a C++ library into the existing C program is beyond my skills at the moment. not getting anywhere! might be something to come back to when i’m more familiar with C/C++. i think it could be slotted in quite nicely even if it’s just used with a C wrapper, and it would definitely give some big performance boosts (presumably with the XMB also).

i had a go at this on my fork: https://github.com/dankcushions/RetroArch/tree/glsloptimizer

it doesn’t work yet… i haven’t figured out how to link C++ libraries with plain C. there’s a wrapper of sorts, but i’m sure the linking stuff is done all wrong. even when that’s done i’m guessing at the params you need to pass in (eg the gl context is surely wrong) so that will need changing.

i may get back to it at some point, but my hope is that someone who knows what they’re doing takes the reins :slight_smile: it can’t be that far away.