commit 375b39502dc0d8561872d65519d2e7644974457c
Author: Leszek Koltunski <leszek@koltunski.pl>
Date:   Wed May 2 00:19:41 2018 +0100

    OIT: something starts working ('Blur' and 'Multiblur' work, 'Triblur' and 'Transparency' do not)

diff --git a/src/main/java/org/distorted/library/main/DistortedEffects.java b/src/main/java/org/distorted/library/main/DistortedEffects.java
index ed71714..263928b 100644
--- a/src/main/java/org/distorted/library/main/DistortedEffects.java
+++ b/src/main/java/org/distorted/library/main/DistortedEffects.java
@@ -70,7 +70,12 @@ public class DistortedEffects
 
   /// BLIT DEPTH PROGRAM ///
   private static DistortedProgram mBlitDepthProgram;
+  private static int mBlitDepthTextureH;
+  private static int mBlitDepthDepthTextureH;
+  private static int mBlitDepthDepthH;
+  private static int mBlitDepthTexCorrH;
   private static int mBlitDepthSizeH;
+  private static int mBlitDepthNumRecordsH;
 
   private static int[] mLinkedListSSBO = new int[1];
   private static int[] mAtomicCounter = new int[1];
@@ -83,13 +88,11 @@ public class DistortedEffects
 
   private static int mBufferSize=(0x1<<23);  // 8 million entries
 
-  private static IntBuffer mIntBuffer;
-
-private static ByteBuffer mBuf, mAtomicBuf;
-private static IntBuffer mIntBuf, mAtomicIntBuf;
-
   /// BLIT DEPTH RENDER PROGRAM ///
   private static DistortedProgram mBlitDepthRenderProgram;
+  private static int mBlitDepthRenderDepthTextureH;
+  private static int mBlitDepthRenderDepthH;
+  private static int mBlitDepthRenderTexCorrH;
   private static int mBlitDepthRenderSizeH;
 
   /// NORMAL PROGRAM /////
@@ -184,10 +187,12 @@ private static IntBuffer mIntBuf, mAtomicIntBuf;
       }
 
     int blitDepthProgramH   = mBlitDepthProgram.getProgramHandle();
+    mBlitDepthTextureH      = GLES31.glGetUniformLocation( blitDepthProgramH, "u_Texture");
+    mBlitDepthDepthTextureH = GLES31.glGetUniformLocation( blitDepthProgramH, "u_DepthTexture");
+    mBlitDepthDepthH        = GLES31.glGetUniformLocation( blitDepthProgramH, "u_Depth");
+    mBlitDepthTexCorrH      = GLES31.glGetUniformLocation( blitDepthProgramH, "u_TexCorr");
     mBlitDepthSizeH         = GLES31.glGetUniformLocation( blitDepthProgramH, "u_Size");
-
-    mIntBuffer = ByteBuffer.allocateDirect(4).order(ByteOrder.nativeOrder()).asIntBuffer();
-    mIntBuffer.put(0,0);
+    mBlitDepthNumRecordsH   = GLES31.glGetUniformLocation( blitDepthProgramH, "u_numRecords");
 
     if( mLinkedListSSBO[0]<0 )
       {
@@ -203,7 +208,7 @@ private static IntBuffer mIntBuf, mAtomicIntBuf;
       GLES31.glGenBuffers(1,mAtomicCounter,0);
       GLES31.glBindBufferBase(GLES31.GL_ATOMIC_COUNTER_BUFFER, 0, mAtomicCounter[0]);
       GLES31.glBindBuffer(GLES31.GL_ATOMIC_COUNTER_BUFFER, mAtomicCounter[0] );
-      GLES31.glBufferData(GLES31.GL_ATOMIC_COUNTER_BUFFER, 4, mIntBuffer, GLES31.GL_DYNAMIC_DRAW);
+      GLES31.glBufferData(GLES31.GL_ATOMIC_COUNTER_BUFFER, 4, null, GLES31.GL_DYNAMIC_DRAW);
       GLES31.glBindBuffer(GLES31.GL_ATOMIC_COUNTER_BUFFER, 0);
       }
 
@@ -222,6 +227,9 @@ private static IntBuffer mIntBuf, mAtomicIntBuf;
       }
 
     int blitDepthRenderProgramH   = mBlitDepthRenderProgram.getProgramHandle();
+    mBlitDepthRenderDepthTextureH = GLES31.glGetUniformLocation( blitDepthRenderProgramH, "u_DepthTexture");
+    mBlitDepthRenderDepthH        = GLES31.glGetUniformLocation( blitDepthRenderProgramH, "u_Depth");
+    mBlitDepthRenderTexCorrH      = GLES31.glGetUniformLocation( blitDepthRenderProgramH, "u_TexCorr");
     mBlitDepthRenderSizeH         = GLES31.glGetUniformLocation( blitDepthRenderProgramH, "u_Size");
 
     // NORMAL PROGRAM //////////////////////////////////////
@@ -377,6 +385,33 @@ private static IntBuffer mIntBuf, mAtomicIntBuf;
     GLES31.glDrawArrays(GLES31.GL_TRIANGLE_STRIP, 0, 4);
     }
 
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// reset atomic counter to 0
+
+  static void zeroOutAtomic()
+    {
+    // Maps the 4-byte atomic counter buffer, overwrites its single int with 0 and unmaps it.
+    GLES31.glBindBuffer(GLES31.GL_ATOMIC_COUNTER_BUFFER, mAtomicCounter[0] );
+
+    ByteBuffer atomicBuf = (ByteBuffer)GLES31.glMapBufferRange( GLES31.GL_ATOMIC_COUNTER_BUFFER, 0, 4,
+                                                                GLES31.GL_MAP_READ_BIT|GLES31.GL_MAP_WRITE_BIT);
+    if( atomicBuf!=null )
+      {
+      atomicBuf.order(ByteOrder.nativeOrder()).asIntBuffer().put(0, 0);
+
+      // Unmap only a buffer that really got mapped: glUnmapBuffer() on an unmapped buffer
+      // raises GL_INVALID_OPERATION, which would pollute the GL error state after a failed map.
+      GLES31.glUnmapBuffer(GLES31.GL_ATOMIC_COUNTER_BUFFER);
+      }
+    else
+      {
+      android.util.Log.e("counter", "failed to map buffer");
+      }
+
+    // always leave the atomic counter target unbound
+    GLES31.glBindBuffer(GLES31.GL_ATOMIC_COUNTER_BUFFER, 0);
+    }
+
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
   static void blitDepthPriv(DistortedOutputSurface surface, float corrW, float corrH)
@@ -384,7 +419,12 @@ private static IntBuffer mIntBuf, mAtomicIntBuf;
     mBlitDepthProgram.useProgram();
 
     GLES31.glViewport(0, 0, surface.mWidth, surface.mHeight );
+    GLES31.glUniform1i(mBlitDepthTextureH, 0);
+    GLES31.glUniform1i(mBlitDepthDepthTextureH, 1);
+    GLES31.glUniform2f(mBlitDepthTexCorrH, corrW, corrH );
     GLES31.glUniform2f(mBlitDepthSizeH, surface.mWidth, surface.mHeight);
+    GLES31.glUniform1ui(mBlitDepthNumRecordsH, (mBufferSize-surface.mWidth*surface.mHeight)/3 );  // see the fragment shader
+    GLES31.glUniform1f(mBlitDepthDepthH , 1.0f-surface.mNear);
     GLES31.glVertexAttribPointer(mBlitDepthProgram.mAttribute[0], 2, GLES31.GL_FLOAT, false, 0, mQuadPositions);
     GLES31.glDrawArrays(GLES31.GL_TRIANGLE_STRIP, 0, 4);
     }
@@ -400,31 +440,12 @@ private static IntBuffer mIntBuf, mAtomicIntBuf;
     //analyzeBuffer(surface.mWidth, surface.mHeight);
 
     GLES31.glViewport(0, 0, surface.mWidth, surface.mHeight );
+    GLES31.glUniform1i(mBlitDepthRenderDepthTextureH, 1);
+    GLES31.glUniform2f(mBlitDepthRenderTexCorrH, corrW, corrH );
     GLES31.glUniform2f(mBlitDepthRenderSizeH, surface.mWidth, surface.mHeight);
+    GLES31.glUniform1f( mBlitDepthRenderDepthH , 1.0f-surface.mNear);
     GLES31.glVertexAttribPointer(mBlitDepthRenderProgram.mAttribute[0], 2, GLES31.GL_FLOAT, false, 0, mQuadPositions);
     GLES31.glDrawArrays(GLES31.GL_TRIANGLE_STRIP, 0, 4);
-
-    // reset atomic counter to 0
-    GLES31.glBindBuffer(GLES31.GL_ATOMIC_COUNTER_BUFFER, mAtomicCounter[0] );
-
-    mAtomicBuf = (ByteBuffer)GLES31.glMapBufferRange( GLES31.GL_ATOMIC_COUNTER_BUFFER, 0, 4,
-                                                      GLES31.GL_MAP_READ_BIT|GLES31.GL_MAP_WRITE_BIT);
-    if( mAtomicBuf!=null )
-      {
-      mAtomicIntBuf = mAtomicBuf.order(ByteOrder.nativeOrder()).asIntBuffer();
-
-      int counter = mAtomicIntBuf.get(0);
-      mAtomicIntBuf.put(0, 0);
-      //android.util.Log.e("counter", "now = "+counter+" w="+surface.mWidth+" h="+surface.mHeight
-      //                             +" diff="+(counter-surface.mWidth*surface.mHeight));
-      }
-    else
-      {
-      android.util.Log.e("counter", "failed to map buffer");
-      }
-
-    GLES31.glUnmapBuffer(GLES31.GL_ATOMIC_COUNTER_BUFFER);
-    GLES31.glBindBuffer(GLES31.GL_ATOMIC_COUNTER_BUFFER, 0);
     }
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -435,20 +456,20 @@ private static IntBuffer mIntBuf, mAtomicIntBuf;
     int errors = 0;
 
     GLES31.glBindBuffer(GLES31.GL_SHADER_STORAGE_BUFFER, mLinkedListSSBO[0]);
-    mBuf = (ByteBuffer)GLES31.glMapBufferRange(GLES31.GL_SHADER_STORAGE_BUFFER, 0, mBufferSize*4, GLES31.GL_MAP_READ_BIT);
-    mIntBuf = mBuf.order(ByteOrder.nativeOrder()).asIntBuffer();
+    ByteBuffer buf = (ByteBuffer)GLES31.glMapBufferRange(GLES31.GL_SHADER_STORAGE_BUFFER, 0, mBufferSize*4, GLES31.GL_MAP_READ_BIT);
+    IntBuffer intBuf = buf.order(ByteOrder.nativeOrder()).asIntBuffer();
 
     for(int col=0; col<w; col++)
       for(int row=0; row<h; row++)
         {
         index = col+row*w;
-        ptr = mIntBuf.get(index);
+        ptr = intBuf.get(index);
 
         if( ptr!=0 )
           {
           if( ptr>0 && ptr<mBufferSize )
             {
-            ptr = mIntBuf.get(ptr);
+            ptr = intBuf.get(ptr);
             if( ptr != index )
               {
               android.util.Log.d("surface", "col="+col+" row="+row+" val="+ptr+" expected: "+index);
diff --git a/src/main/java/org/distorted/library/main/DistortedOutputSurface.java b/src/main/java/org/distorted/library/main/DistortedOutputSurface.java
index 71d7d30..df694b7 100644
--- a/src/main/java/org/distorted/library/main/DistortedOutputSurface.java
+++ b/src/main/java/org/distorted/library/main/DistortedOutputSurface.java
@@ -353,7 +353,11 @@ public static final int DEBUG_FPS = 1;
 
         if( lastBucket!=currBucket )
           {
-          if( lastBucket!=0 )
+          if( lastBucket==0 )
+            {
+            DistortedEffects.zeroOutAtomic();
+            }
+          else
             {
             for(int j=bucketChange; j<i; j++)
               {
@@ -386,10 +390,8 @@ public static final int DEBUG_FPS = 1;
           numRenders += currQueue.postprocess(mBuffer);
           numRenders += blitWithDepth(time, mBuffer[quality]);
           GLES31.glMemoryBarrier(GLES31.GL_ALL_BARRIER_BITS);
-          //GLES31.glFinish();
           numRenders += blitWithDepthRender(time,mBuffer[quality]);  // merge the OIT linked list
           clearBuffer(mBuffer[quality]);
-          //GLES31.glFinish();
           }
         }
 
diff --git a/src/main/res/raw/blit_depth_fragment_shader.glsl b/src/main/res/raw/blit_depth_fragment_shader.glsl
index 7654ad6..7d10844 100644
--- a/src/main/res/raw/blit_depth_fragment_shader.glsl
+++ b/src/main/res/raw/blit_depth_fragment_shader.glsl
@@ -22,26 +22,105 @@ precision highp int;
 
 out vec4 fragColor;
 in vec2 v_TexCoordinate;
+in vec2 v_Pixel;              // location of the current fragment, in pixels
+
+uniform sampler2D u_Texture;
+uniform sampler2D u_DepthTexture;
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+// per-pixel linked list. Order Independent Transparency.
 
 uniform vec2 u_Size;
+uniform uint u_numRecords;
 
 layout (binding=0, offset=0) uniform atomic_uint u_Counter;
 
-layout (std430,binding=1) buffer linkedlist
+layout (std430,binding=1) buffer linkedlist  // first (u_Size.x*u_Size.y) uints - head pointers,
+  {                                          // one for each pixel in the Output rectangle.
+  uint u_Records[];                          //
+  };                                         // Next 3*u_numRecords uints - actual linked list, i.e.
+                                             // triplets of (pointer,depth,rgba).
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+// Concurrent insert into a sorted linked list, after Tim Harris, 'A Pragmatic Implementation of
+// Non-Blocking Linked-Lists', 2001.
+// Records are kept in order of decreasing 'depth' key, which looks like back to front - but
+// main() below passes S*(1-depth)/2 as the key, so the list really runs front to back.
+
+void insert( vec2 ij, uint depth, uint rgba )  // ij: pixel location; depth: sort key; rgba: packed colour
+  {
+  uint ptr = atomicCounterIncrement(u_Counter);  // claim a unique record number
+/* Disabled variant, kept for reference: no sorting, it simply overwrites the pixel's head pointer.
+  if( ptr<u_numRecords )
+    {
+    ptr = 3u*ptr + uint(u_Size.x*u_Size.y);
+
+    u_Records[ptr   ] = 0u;
+    u_Records[ptr+1u] = depth;
+    u_Records[ptr+2u] = rgba;//(255u<<16u) + (255u);//rgba;
+
+    uint index = uint(ij.x + ij.y * u_Size.x);
+
+    u_Records[index] = ptr;
+    discard;
+    }
+*/
+  if( ptr<u_numRecords )  // pool exhausted: nothing is stored and we return without discarding
+    {
+    ptr = 3u*ptr + uint(u_Size.x*u_Size.y);  // record number -> index of its (pointer,depth,rgba) triplet
+
+    u_Records[ptr+1u] = depth;
+    u_Records[ptr+2u] = rgba;
+
+    memoryBarrier();  // order the payload writes above before the record gets linked in below
+
+    uint prev = uint(ij.x + ij.y * u_Size.x);  // head slot of this pixel. NOTE(review): summed as floats before truncation - confirm ij is integral
+    uint curr = u_Records[prev];               // first record of the list, 0u = end of list
+
+    while (true)
+      {
+      if ( curr==0u || depth > u_Records[curr+1u] )  // need to insert here
+        {
+        u_Records[ptr] = curr;     // next of new record is curr
+        memoryBarrier();
+        uint res = atomicCompSwap( u_Records[prev], curr, ptr );  // link in only if prev still points at curr
+
+        if (res==curr) break;      // done!
+        else           curr = res; // could not insert! retry from same place in list
+        }
+      else                         // advance in list
+        {
+        prev = curr;
+        curr = u_Records[prev];
+        }
+      }
+
+    discard;  // the fragment now lives in the list, so nothing is written to the framebuffer
+    }
+  }
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+
+uint convert(vec4 c)  // packs c as 0xRRGGBBAA, 8 bits per channel
   {
-  uint u_Records[];
-  };
+  return packUnorm4x8(c.abgr);  // clamps to [0,1] and rounds; first component (a) lands in the low byte, last (r) in the high byte
+  }
 
 //////////////////////////////////////////////////////////////////////////////////////////////
 
 void main()                    		
   {
-  uint pixelX = uint(v_TexCoordinate.x * u_Size.x);
-  uint pixelY = uint(v_TexCoordinate.y * u_Size.y);
-  uint index  = pixelX + pixelY * uint(u_Size.x);
-
-  uint ptr = uint(u_Size.x*u_Size.y) + atomicCounterIncrement(u_Counter);
-  u_Records[ptr  ] = index;
-  u_Records[index] = ptr;
-  discard;
+  // Blit (almost) opaque fragments directly; push translucent ones onto the per-pixel list.
+  vec4 frag  = texture(u_Texture     , v_TexCoordinate);
+  float depth= texture(u_DepthTexture, v_TexCoordinate).r;
+  const float S= 2147483647.0; // max signed int. Could probably be max unsigned int but this is enough.
+
+  if( frag.a > 0.95 )
+    {
+    gl_FragDepth = depth;
+    fragColor    = frag;
+    return;
+    }
+  if( frag.a > 0.0 ) insert(v_Pixel, uint(S*(1.0-depth)/2.0), convert(frag) );  // key grows as depth shrinks
+  discard;  // fully transparent, or insert() found the pool full: never leave fragColor/gl_FragDepth undefined
   }
\ No newline at end of file
diff --git a/src/main/res/raw/blit_depth_render_fragment_shader.glsl b/src/main/res/raw/blit_depth_render_fragment_shader.glsl
index bfcb0d0..cfcc064 100644
--- a/src/main/res/raw/blit_depth_render_fragment_shader.glsl
+++ b/src/main/res/raw/blit_depth_render_fragment_shader.glsl
@@ -20,28 +20,56 @@
 precision highp float;
 precision highp int;
 
-out vec4 fragColor;
-in vec2 v_TexCoordinate;
+out vec4 fragColor;           // The output color
+in vec2 v_TexCoordinate;      // Interpolated texture coordinate per fragment.
+in vec2 v_Pixel;              // location of the current fragment, in pixels
+
+uniform sampler2D u_DepthTexture;
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+// per-pixel linked list. Order Independent Transparency.
 
 uniform vec2 u_Size;
 
-layout (std430,binding=1) buffer linkedlist
+layout (std430,binding=1) buffer linkedlist  // first (u_Size.x*u_Size.y) uints - head pointers,
+  {                                          // one for each pixel in the Output rectangle.
+  uint u_Records[];                          //
+  };                                         // Next 3*u_numRecords uints - actual linked list, i.e.
+                                             // triplets of (pointer,depth,rgba).
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+
+vec4 convert(uint rgba)  // unpacks 0xRRGGBBAA into (r,g,b,a), each component in [0,1]
   {
-  uint u_Records[];
-  };
+  return vec4( uvec4(rgba>>24u, rgba>>16u, rgba>>8u, rgba) & 255u ) / 255.0;
+  }
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+// 'Over' operator, https://en.wikipedia.org/wiki/Alpha_compositing (premultiplied): clr lies on top of srf.
+vec4 blend(vec4 clr,vec4 srf)
+  {
+  vec4 layer = vec4(srf.rgb * srf.a , srf.a);  // premultiply the incoming colour by its own alpha
+  return clr + (1.0 - clr.a) * layer;          // it only shows through what clr leaves uncovered
+  }
 
 //////////////////////////////////////////////////////////////////////////////////////////////
 
 void main()                    		
   {
-  uint pixelX = uint(v_TexCoordinate.x * u_Size.x);
-  uint pixelY = uint(v_TexCoordinate.y * u_Size.y);
-  uint index  = pixelX + pixelY * uint(u_Size.x);
+  uint index = uint(v_Pixel.x + v_Pixel.y * u_Size.x);  // head-pointer slot of this pixel
+  uint curr = u_Records[index];                         // first record of its list, 0u = empty
+  // Merge pass: blend every record of this pixel's linked list into a single output colour.
+  if (curr == 0u) discard;                              // nothing was recorded for this pixel
+
+  vec4 color= vec4(0.0,0.0,0.0,0.0);                    // accumulator, starts fully transparent
+  u_Records[index] = 0u;                                // empty the head slot again
 
-  uint ptr = u_Records[index];
-  uint color = u_Records[ptr];
-  //u_Records[ptr] = 0u;
+  while (curr > 0u)
+    {
+    color= blend( color, convert(u_Records[curr+2u]) );  // keep walking the linked list
+    curr = u_Records[curr];                              // and blending the colors in
+    }
 
-  if( color==index ) fragColor = vec4(0.0,1.0,0.0,1.0);
-  else               fragColor = vec4(1.0,0.0,0.0,1.0);
+  gl_FragDepth = texture(u_DepthTexture, v_TexCoordinate).r;  // depth is read from the depth texture, not from the records
+  fragColor    = color;
   }
\ No newline at end of file
diff --git a/src/main/res/raw/blit_depth_vertex_shader.glsl b/src/main/res/raw/blit_depth_vertex_shader.glsl
index f5311ca..00f5196 100644
--- a/src/main/res/raw/blit_depth_vertex_shader.glsl
+++ b/src/main/res/raw/blit_depth_vertex_shader.glsl
@@ -20,13 +20,23 @@
 precision highp float;
 precision highp int;
 
-in vec2 a_Position;
-out vec2 v_TexCoordinate;
+in vec2 a_Position;           // Per-vertex position.
+out vec2 v_TexCoordinate;     //
+out vec2 v_Pixel;             //
+
+uniform float u_Depth;        // distance from the near plane to render plane, in clip coords
+uniform vec2  u_TexCorr;      // when we blit from postprocessing buffers, the buffers can be
+                              // larger than necessary (there is just one static set being
+                              // reused!) so we need to compensate here by adjusting the texture
+                              // coords.
+
+uniform vec2 u_Size;         // size of the output surface, in pixels.
 
 //////////////////////////////////////////////////////////////////////////////////////////////
 
 void main()
   {
-  v_TexCoordinate = a_Position + 0.5;
-  gl_Position     = vec4(2.0*a_Position,1.0,1.0);
+  v_TexCoordinate = u_TexCorr * (a_Position + 0.5);  // presumably a_Position spans [-0.5,0.5] - confirm against the quad data
+  v_Pixel         = u_Size * v_TexCoordinate;        // same location expressed in output pixels
+  gl_Position     = vec4(2.0*a_Position.x, 2.0*a_Position.y, u_Depth, 1.0);
   }
