How to implement emission with MSL - Objective-C

I have been studying the Metal API (newbie here) and working through the examples and documentation to create a 3D animation in my iOS app. I know I could use SceneKit, but for the moment I want to understand and learn how the Metal API works.
One of the examples from Apple implements a decent number of texture maps for the object, but the emission texture is missing. After a bit of study I came up with a solution like this:
float3 computeDiffuse(LightingParameters parameters)
{
    float3 diffuseRawValue = float3(((1.0/PI) * parameters.baseColor) * (1.0 - parameters.metalness));
    return diffuseRawValue * parameters.nDotl * parameters.ambientOcclusion;
}

float3 computeSpecular(LightingParameters parameters)
{
    float specularRoughness = parameters.roughness * (1.0 - parameters.metalness) + parameters.metalness;
    float Ds = Distribution(parameters.nDoth, specularRoughness);
    float3 Cspec0 = float3(1.0f);
    float3 Fs = float3(mix(float3(Cspec0), float3(1), Fresnel(parameters.hDotl)));
    float alphaG = sqr(specularRoughness * 0.5 + 0.5);
    float Gs = Geometry(parameters.nDotl, alphaG) * Geometry(parameters.nDotv, alphaG);
    float3 specularOutput = (Ds * Gs * Fs * parameters.irradiatedColor)
                            * (1.0 + parameters.metalness * float3(parameters.baseColor))
                            + float3(parameters.metalness)
                            * parameters.irradiatedColor
                            * float3(parameters.baseColor);
    return specularOutput * parameters.ambientOcclusion;
}

float3 computeEmissive(LightingParameters parameters)
{
    float3 diffuseRawValue = float3((1.0/PI) * parameters.emissiveColor);
    return diffuseRawValue;
}
//...
fragment float4
fragmentLighting(ColorInOut in [[stage_in]],
                 constant AAPLUniforms & uniforms [[ buffer(AAPLBufferIndexUniforms) ]],
                 constant AAPLMaterialUniforms & materialUniforms [[ buffer(AAPLBufferIndexMaterialUniforms) ]],
                 texture2d<float> baseColorMap [[ texture(AAPLTextureIndexBaseColor), function_constant(has_base_color_map) ]],
                 texture2d<float> normalMap [[ texture(AAPLTextureIndexNormal), function_constant(has_normal_map) ]],
                 texture2d<float> metallicMap [[ texture(AAPLTextureIndexMetallic), function_constant(has_metallic_map) ]],
                 texture2d<float> roughnessMap [[ texture(AAPLTextureIndexRoughness), function_constant(has_roughness_map) ]],
                 texture2d<float> ambientOcclusionMap [[ texture(AAPLTextureIndexAmbientOcclusion), function_constant(has_ambient_occlusion_map) ]],
                 texture2d<float> emissiveMap [[ texture(AAPLTextureIndexEmissive), function_constant(has_emissive_map) ]],
                 texturecube<float> irradianceMap [[ texture(AAPLTextureIndexIrradianceMap), function_constant(has_irradiance_map) ]])
{
    float4 final_color = float4(0);
    LightingParameters parameters = calculateParameters(in,
                                                        uniforms,
                                                        materialUniforms,
                                                        baseColorMap,
                                                        normalMap,
                                                        metallicMap,
                                                        roughnessMap,
                                                        ambientOcclusionMap,
                                                        emissiveMap,
                                                        irradianceMap);
    final_color = float4(computeSpecular(parameters) + computeDiffuse(parameters) + computeEmissive(parameters), 1.0f);
    return final_color;
}
The code above makes the object slightly white, which is not the desired effect.
Note the line computeSpecular(parameters) + computeDiffuse(parameters) + computeEmissive(parameters): in the original source code computeEmissive(parameters) does not exist; it's something I implemented trying to make the color more intense. This is probably totally wrong, and maybe what I should be doing is changing the contrast, brightness, and luminance, but I am not sure how to do that.
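For what it's worth, from what I've read about PBR pipelines, emission is usually treated as radiance that the surface adds on its own, so it is summed in without the 1.0/PI factor (that factor is the Lambertian normalization for diffuse only). A minimal sketch of that variant, as an assumption on my part rather than something from Apple's sample:
float3 computeEmissive(LightingParameters parameters)
{
    // Sketch: emission is radiance the surface adds directly, so no
    // 1.0/PI normalization; a material intensity could optionally scale it.
    return parameters.emissiveColor;
}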
I am probably doing everything wrong, I am open to any suggestion. :)
Thank you.

Related

How to draw a helix in 3D using a fragment shader (Shadertoy)

I am relatively new to GLSL.
I want to create a solar system model and use it as a wallpaper (using Shadertoy), something like this. While I have the planets moving correctly, I can't figure out how to do the helix paths that follow those planets.
Here is my code so far
uniform vec2 iResolution;
uniform float iTime;

#define pi 3.141592653589

float circ(vec2 uv, vec2 pos, float rad, float blur) {
    return smoothstep(blur, 0., length(-uv + pos) - rad); // draws a circle to the screen
}

float line(vec2 uv, vec3 start, vec3 end, float width) {
    vec2 p = uv - start.xy;
    vec2 d = end.xy - start.xy;
    float l = length(d);
    d = normalize(d); // direction
    float t = clamp(dot(p, d), 0., l);
    return (length(p - d*t)) < width ? 1. : 0.;
}

// note: the parameter is named `len` because calling it `length` would
// shadow the built-in length() function used below
float helix(vec2 uv, vec3 start, vec3 direction, float width, float len, float angle) {
    float delta = iTime / angle;
    vec2 p = uv - start.xy;
    vec2 d = (normalize(direction) * len).xy;
    float l = length(d);
    d /= l;
    float t = clamp(dot(p, d), 0., l);
    return (length(p - d*t)) < width ? 1. : 0.;
}

vec3 rotate(vec3 point, vec3 angle) {
    mat3 rot = mat3(
        cos(angle.y)*cos(angle.z), cos(angle.z)*sin(angle.x)*sin(angle.y)-cos(angle.x)*sin(angle.z), cos(angle.x)*cos(angle.z)*sin(angle.y)+sin(angle.x)*sin(angle.z),
        cos(angle.y)*sin(angle.z), cos(angle.x)*cos(angle.z)+sin(angle.x)*sin(angle.y)*sin(angle.z), -cos(angle.z)*sin(angle.x)+cos(angle.x)*sin(angle.y)*sin(angle.z),
        -sin(angle.y), cos(angle.y)*sin(angle.x), cos(angle.x)*cos(angle.y));
    return rot * point;
}
void main() {
    vec2 uv = fragCoord / iResolution.xy;
    float ratio = iResolution.x / iResolution.y;
    uv -= .5;            // center origin
    uv.x = uv.x * ratio; // make screen square
    uv /= .3;            // zoom
    float planetA[5] = float[](0., iTime / 0.241, iTime / 0.6152, iTime, iTime / 1.8809);
    vec3 planets[5] = vec3[](
        vec3(0.),                                              // sun
        vec3(cos(planetA[1]) * .4, sin(planetA[1]) * .4, 0.),  // mercury
        vec3(cos(planetA[2]) * .7, sin(planetA[2]) * .7, 0.),  // venus
        vec3(cos(planetA[3]), sin(planetA[3]), 0.),            // earth
        vec3(cos(planetA[4]) * 1.5, sin(planetA[4]) * 1.5, 0.) // mars
    );
    vec3 planetsC[5] = vec3[](
        vec3(0.89, 0.9, 0.45),  // sun
        vec3(0.54, 0.57, 0.63), // mercury
        vec3(0.9, 0.5, 0.2),    // venus
        vec3(0.2, 0.3, 0.8),    // earth
        vec3(0.8, 0.3, 0.2)     // mars
    );
    vec3 rotVec = vec3(-pi/4, pi/4, 0.);
    fragColor = vec4(0.);
    fragColor = mix(fragColor, vec4(1.), line(uv, vec3(0.), rotate(vec3(0., 0., 2.), rotVec), 0.01)); // sun trail
    for (int i = 1; i < planets.length(); i++) {
        planets[i] = rotate(planets[i], vec3(-pi/4., pi/4., 0.)); // rotate the planet
        fragColor = mix(fragColor, vec4(planetsC[i], 1.), helix(uv, planets[i], rotate(vec3(0., 0., 2.), rotVec), 0.01, 2., planetA[i])); // planet trail
    }
    for (int i = 0; i < planets.length(); i++) { // draws the planets
        fragColor = mix(fragColor, vec4(planetsC[i], 1.), circ(uv, planets[i].xy, 0.05, 0.01));
    }
}
The helix function is currently only a modified version of the line method, but I want it to curve around the sun's trail.
Any advice and/or help would be appreciated, as I am still learning.
I have tried to convert the helix equation:
x = r * cos(t), y = r * sin(t), z = t
but haven't gotten it to work.
Here's the method currently, although it only displays a straight line:
float helix(vec2 uv, vec3 start, vec3 direction, float width, float len, float angle) {
    float delta = iTime / angle;
    vec2 p = uv - start.xy;
    vec2 d = (normalize(direction) * len).xy;
    float l = length(d);
    d /= l;
    float t = clamp(dot(p, d), 0., l);
    return (length(p - d*t)) < width ? 1. : 0.;
}
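One way to use the parametric form in a 2D shader (a rough sketch of mine, assuming the same uniforms as above; the radius and winding constants are made up) is to find the parameter t by projecting onto the axis as before, then measure the distance to the helix point at t instead of to the straight axis:
float helixCurve(vec2 uv, vec3 start, vec3 direction, float width, float len, float angle) {
    vec2 p = uv - start.xy;
    vec2 d = normalize(direction.xy);    // axis direction on screen
    float t = clamp(dot(p, d), 0., len); // parameter along the axis
    vec2 side = vec2(-d.y, d.x);         // perpendicular to the axis
    float r = 0.05;                      // helix radius (made up)
    float turns = 40.;                   // winding frequency (made up)
    // x = r*cos(t): seen from the side, only the sideways component of the
    // winding is visible, so offset the axis point by it
    vec2 curvePoint = d * t + side * r * cos(turns * t + iTime / angle);
    return length(p - curvePoint) < width ? 1. : 0.;
}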

On improving the performance of element-wise multiplication of an array with a scalar in Rust

First of all, I'm new to Rust, so excuse any "not so great" implementations.
I'm building a game where I need to draw a texture by changing values in an RGBA vector that later gets turned into an image (it's for a raycaster game, if you're wondering). Inside said function, I multiply the current pixel by a floating-point value called shade that darkens or brightens the pixel depending on the lighting. Since this gets called a lot and the profiler flags it as a hotspot, I'm trying to make it as fast as possible, given that the current implementation is not the best.
pub fn draw_texture(
    &mut self,
    texture: &Vec<u8>,
    texture_position: [usize; 2],
    pixel_position: [usize; 2],
    width_rect: usize,
    shade: f32,
    length: usize,
) {
    let pos = (texture_position[1] * length + texture_position[0]) << 2; // Position of current pixel
    (0..width_rect).for_each(|i| { // Draws in rectangles of size 1 x width_rect
        let mut pixel: [u8; 4] = texture[pos..pos + 4].try_into().unwrap(); // rgba pixel
        if shade != 1.0 { // Draws shade depending on current lighting, darkening or brightening the pixel
            pixel[0] = (pixel[0] as f32 * shade) as u8;
            pixel[1] = (pixel[1] as f32 * shade) as u8;
            pixel[2] = (pixel[2] as f32 * shade) as u8;
        }
        if pixel[3] == 255 { // Doesn't draw transparent pixels
            self.draw_pixel([pixel_position[0] + i, pixel_position[1]], &pixel);
        }
    });
}

pub fn draw_pixel(&mut self, position: [usize; 2], pixel: &[u8; 4]) {
    let i = position[1] * self.width + position[0];
    if (i << 2) + 4 < self.img_arr_len {
        self.img_arr[(i << 2)..(i << 2) + 4].copy_from_slice(pixel);
    }
}
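One pattern that often helps with this shape of hot loop (an assumption worth profiling, not a guaranteed win): shade is constant for the whole call and the channels are u8, so the three u8 -> f32 -> u8 round trips per pixel can be replaced by a 256-entry lookup table built once per call. A sketch:
// Sketch: build the table once per draw_texture call...
fn build_shade_lut(shade: f32) -> [u8; 256] {
    let mut lut = [0u8; 256];
    for (i, v) in lut.iter_mut().enumerate() {
        // clamp in case shade > 1.0 brightens past 255
        *v = (i as f32 * shade).min(255.0) as u8;
    }
    lut
}

// ...then, inside the per-pixel loop:
// pixel[0] = lut[pixel[0] as usize];
// pixel[1] = lut[pixel[1] as usize];
// pixel[2] = lut[pixel[2] as usize];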

How to do batching without UBOs?

I'm trying to implement batching for a WebGL renderer which is struggling with lots of small objects due to too many draw calls. What I thought is I'd batch them all by the kind of shader they use, then draw a few at a time, uploading material parameters and the model matrix for each object once in uniforms.
My problem is that the uniform size limits for non-UBO uniforms are extremely low, as in 256 floats low at a minimum. If my material uses, say, 8 floats, and if you factor in the model matrix, I barely have enough uniforms to draw 10 models in a single batch, which isn't really going to be enough.
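(Worked through: a mat4 model matrix is 16 floats, plus the 8 material floats makes 24 floats per model, and floor(256 / 24) = 10 models per batch.)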
Is there any hope to make this work without UBOs? Are textures an option? How are people doing batching without WebGL2 UBOs?
More details: I have no skinning or complex animations; I just have some shaders (diffuse, Cook-Torrance, whatever), and each model has different material settings for each shader, e.g. color, roughness, index of refraction, which can be changed dynamically by the user (so it's not realistic to bake them into the vertex array, because we have some high-poly data; also, users can switch shaders, and not all shaders have the same number of parameters), as well as material maps, obviously. The geometry itself is static and just has a linear transform on each model. For the most part all meshes are different, so geometry instancing won't help a whole lot, but I can look at that later.
Thanks
I don't know that this is actually faster than lots of draw calls, but here is drawing 4 models with a single draw call.
It works by adding an id per model. So, for every vertex in model #0 put a 0, for every vertex in model #1 put a 1, etc.
Then it uses the model id to index stuff in a texture. The easiest scheme is that the model id chooses the row of a texture, and all the data for that model is pulled out of that row.
For WebGL1
attribute float modelId;
...

#define TEXTURE_WIDTH ??
#define COLOR_OFFSET    ((0.0 + 0.5) / TEXTURE_WIDTH)
#define MATERIAL_OFFSET ((1.0 + 0.5) / TEXTURE_WIDTH)

float modelOffset = (modelId + .5) / textureHeight;
vec4 color = texture2D(perModelData, vec2(COLOR_OFFSET, modelOffset));
vec4 roughnessIndexOfRefraction = texture2D(perModelData,
                                            vec2(MATERIAL_OFFSET, modelOffset));
etc..
As long as you are not drawing more than gl.getParameter(gl.MAX_TEXTURE_SIZE) models, it will work. If you have more than that, either use more draw calls or change the texture coordinate calculations so there's more than one model per row.
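For example (my sketch, not part of the original answer), with several models per row the id could be split into a row and a column block; the constants here are illustrative:
// Sketch: MODELS_PER_ROW models share a texture row, each model taking
// PIXELS_PER_MODEL pixels of that row.
#define MODELS_PER_ROW 4.0
#define PIXELS_PER_MODEL 2.0
float row = floor(modelId / MODELS_PER_ROW);
float col = mod(modelId, MODELS_PER_ROW) * PIXELS_PER_MODEL;
float v = (row + 0.5) / textureHeight;
vec4 color = texture2D(perModelData, vec2((col + 0.5) / TEXTURE_WIDTH, v));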
In WebGL2 you'd change the code to use texelFetch and unsigned integers:
in uint modelId;
...

#define COLOR_OFFSET    0
#define MATERIAL_OFFSET 1

vec4 color = texelFetch(perModelData, ivec2(COLOR_OFFSET, modelId), 0);
vec4 roughnessIndexOfRefraction = texelFetch(perModelData,
                                             ivec2(MATERIAL_OFFSET, modelId), 0);
Here is an example of 4 models drawn with 1 draw call. For each model, the model matrix and color are stored in the texture.
const m4 = twgl.m4;
const v3 = twgl.v3;
const gl = document.querySelector('canvas').getContext('webgl');
const ext = gl.getExtension('OES_texture_float');
if (!ext) {
  alert('need OES_texture_float');
}

const COMMON_STUFF = `
#define TEXTURE_WIDTH 5.0
#define MATRIX_ROW_0_OFFSET ((0. + 0.5) / TEXTURE_WIDTH)
#define MATRIX_ROW_1_OFFSET ((1. + 0.5) / TEXTURE_WIDTH)
#define MATRIX_ROW_2_OFFSET ((2. + 0.5) / TEXTURE_WIDTH)
#define MATRIX_ROW_3_OFFSET ((3. + 0.5) / TEXTURE_WIDTH)
#define COLOR_OFFSET        ((4. + 0.5) / TEXTURE_WIDTH)
`;

const vs = `
attribute vec4 position;
attribute vec3 normal;
attribute float modelId;

uniform float textureHeight;
uniform sampler2D perModelDataTexture;
uniform mat4 projection;
uniform mat4 view;

varying vec3 v_normal;
varying float v_modelId;

${COMMON_STUFF}

void main() {
  v_modelId = modelId;  // pass to fragment shader
  float modelOffset = (modelId + 0.5) / textureHeight;
  // note: in WebGL2 better to use texelFetch
  mat4 model = mat4(
    texture2D(perModelDataTexture, vec2(MATRIX_ROW_0_OFFSET, modelOffset)),
    texture2D(perModelDataTexture, vec2(MATRIX_ROW_1_OFFSET, modelOffset)),
    texture2D(perModelDataTexture, vec2(MATRIX_ROW_2_OFFSET, modelOffset)),
    texture2D(perModelDataTexture, vec2(MATRIX_ROW_3_OFFSET, modelOffset)));
  gl_Position = projection * view * model * position;
  v_normal = mat3(view) * mat3(model) * normal;
}
`;

const fs = `
precision highp float;

varying vec3 v_normal;
varying float v_modelId;

uniform float textureHeight;
uniform sampler2D perModelDataTexture;
uniform vec3 lightDirection;

${COMMON_STUFF}

void main() {
  float modelOffset = (v_modelId + 0.5) / textureHeight;
  vec4 color = texture2D(perModelDataTexture, vec2(COLOR_OFFSET, modelOffset));
  float l = dot(lightDirection, normalize(v_normal)) * .5 + .5;
  gl_FragColor = vec4(color.rgb * l, color.a);
}
`;

// compile shader, link, look up locations
const programInfo = twgl.createProgramInfo(gl, [vs, fs]);

// make some vertex data
const modelVerts = [
  twgl.primitives.createSphereVertices(1, 6, 4),
  twgl.primitives.createCubeVertices(1, 1, 1),
  twgl.primitives.createCylinderVertices(1, 1, 10, 1),
  twgl.primitives.createTorusVertices(1, .2, 16, 8),
];
// merge all the vertices into one
const arrays = twgl.primitives.concatVertices(modelVerts);
// fill an array so each vertex of each model has a modelId
const modelIds = new Uint16Array(arrays.position.length / 3);
let offset = 0;
modelVerts.forEach((verts, modelId) => {
  const end = offset + verts.position.length / 3;
  while (offset < end) {
    modelIds[offset++] = modelId;
  }
});
arrays.modelId = { numComponents: 1, data: modelIds };

// calls gl.createBuffer, gl.bindBuffer, gl.bufferData
const bufferInfo = twgl.createBufferInfoFromArrays(gl, arrays);

const numModels = modelVerts.length;
const tex = gl.createTexture();
const textureWidth = 5;  // 4 pixels for the 4x4 matrix, 1 pixel for the color
gl.bindTexture(gl.TEXTURE_2D, tex);
gl.texImage2D(gl.TEXTURE_2D, 0, gl.RGBA, textureWidth, numModels, 0, gl.RGBA, gl.FLOAT, null);
gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MIN_FILTER, gl.NEAREST);
gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MAG_FILTER, gl.NEAREST);
gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_WRAP_S, gl.CLAMP_TO_EDGE);
gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_WRAP_T, gl.CLAMP_TO_EDGE);

// this data is for the texture, one row per model
// first 4 pixels are the model matrix, 5th pixel is the color
const perModelData = new Float32Array(textureWidth * numModels * 4);
const stride = textureWidth * 4;
const modelOffset = 0;
const colorOffset = 16;

// set the colors at init time
for (let modelId = 0; modelId < numModels; ++modelId) {
  perModelData.set([r(), r(), r(), 1], modelId * stride + colorOffset);
}

function r() {
  return Math.random();
}

function render(time) {
  time *= 0.001;  // seconds

  twgl.resizeCanvasToDisplaySize(gl.canvas);
  gl.viewport(0, 0, gl.canvas.width, gl.canvas.height);

  gl.enable(gl.DEPTH_TEST);
  gl.enable(gl.CULL_FACE);

  const fov = Math.PI * 0.25;
  const aspect = gl.canvas.clientWidth / gl.canvas.clientHeight;
  const near = 0.1;
  const far = 20;
  const projection = m4.perspective(fov, aspect, near, far);
  const eye = [0, 0, 10];
  const target = [0, 0, 0];
  const up = [0, 1, 0];
  const camera = m4.lookAt(eye, target, up);
  const view = m4.inverse(camera);

  // set the matrix for each model in the texture data
  const mat = m4.identity();
  for (let modelId = 0; modelId < numModels; ++modelId) {
    const t = time * (modelId + 1) * 0.3;
    m4.identity(mat);
    m4.rotateX(mat, t, mat);
    m4.rotateY(mat, t, mat);
    m4.translate(mat, [0, 0, Math.sin(t * 1.1) * 4], mat);
    m4.rotateZ(mat, t, mat);
    perModelData.set(mat, modelId * stride + modelOffset);
  }

  // upload the texture data
  gl.bindTexture(gl.TEXTURE_2D, tex);
  gl.texSubImage2D(gl.TEXTURE_2D, 0, 0, 0, textureWidth, numModels,
                   gl.RGBA, gl.FLOAT, perModelData);

  gl.useProgram(programInfo.program);

  // calls gl.bindBuffer, gl.enableVertexAttribArray, gl.vertexAttribPointer
  twgl.setBuffersAndAttributes(gl, programInfo, bufferInfo);

  // calls gl.activeTexture, gl.bindTexture, gl.uniformXXX
  twgl.setUniforms(programInfo, {
    lightDirection: v3.normalize([1, 2, 3]),
    perModelDataTexture: tex,
    textureHeight: numModels,
    projection,
    view,
  });

  // calls gl.drawArrays or gl.drawElements
  twgl.drawBufferInfo(gl, bufferInfo);

  requestAnimationFrame(render);
}
requestAnimationFrame(render);
body { margin: 0; }
canvas { width: 100vw; height: 100vh; display: block; }
<script src="https://twgljs.org/dist/4.x/twgl-full.min.js"></script>
<canvas></canvas>
Here's 2000 models in one draw call
https://jsfiddle.net/greggman/g2tcadho/

GLSL variables not storing?

I am learning GLSL through Unity and I recently came across a problem involving the storing of variables.
Shader "Shader" {
Properties{
_Hole_Position("hole_Position", Vector) = (0., 0., 0., 1.0)
_Hole_EventHorizonDistance("hole_EventHorizonDistance", Float) = 1.0
_DebugValue("debugValue", Float) = 0.0
}
SubShader{
Pass{
GLSLPROGRAM
uniform mat4 _Object2World;
//Variables
varying float debugValue;
varying vec4 pos;
varying vec4 hole_Position;
varying float hole_EventHorizonDistance = 1;
#ifdef VERTEX
void main()
{
pos = _Object2World * gl_Vertex;
gl_Position = gl_ModelViewProjectionMatrix * gl_Vertex;
}
#endif
#ifdef FRAGMENT
void main()
{
float dist = distance(vec4(pos.x, 0.0,pos.z, 1.0), vec4(hole_Position.x, 0.0, hole_Position.z, 1.0));
debugValue = dist;
if (dist < hole_EventHorizonDistance)
{
gl_FragColor = vec4(0.3, 0.3, 0.3, 1.0);
}
else
{
gl_FragColor = vec4(0.4, 0.6, 1.0, 1.0);
}
//gl_FragColor = vec4(hole_EventHorizonDistance, 0, 0, 1.0);
}
#endif
ENDGLSL
}
}
}
Now, _Hole_Position and _Hole_EventHorizonDistance are changed from an external C# script with:
g.GetComponent<Renderer>().sharedMaterial.SetVector("_Hole_Position", new Vector4(transform.position.x, transform.position.y, transform.position.z, 1));
g.GetComponent<Renderer>().sharedMaterial.SetFloat("_Hole_EventHorizonDistance", 2);
This does not work as I intend it to (the fragment's color should change if its position is within 2 units of hole_Position). However, debugging with:
gl_FragColor = vec4(hole_EventHorizonDistance, 0, 0, 1.0);
seemingly suggests that hole_EventHorizonDistance is 0 at all times (the test mesh remains completely black), whereas debugging by getting and printing the variable from outside (via
print(g.GetComponent<Renderer>().sharedMaterial.GetFloat("_Hole_EventHorizonDistance"));
) tells me _Hole_EventHorizonDistance = 2. I cannot wrap my head around why this is the case. Why is it so?
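(For reference, my understanding of what the shader would need for those values to arrive, stated as an assumption: Unity exposes each ShaderLab property to the program as a uniform with the same name, and varying is for vertex-to-fragment interpolation rather than material parameters, so declarations along these lines would pick the values up:)
// Sketch: uniforms named to match the Properties block, replacing the
// varying declarations for the two material parameters.
uniform vec4 _Hole_Position;
uniform float _Hole_EventHorizonDistance;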

How to manually or automatically optimize HLSL (pixel) shader code?

What are successful strategies to optimize HLSL shader code in terms of computational complexity (meaning: minimizing runtime of the shader)?
I guess one way would be to minimize the number of arithmetic operations that result from compiling the shader.
How could this be done a) manually and b) using automated tools (if existing) ?
Collection of manual techniques (Updated)
Avoid branching (But how to do that best?)
Whenever possible: precompute outside shader and pass as argument.
An example code would be:
float2 DisplacementScroll;

// Parameters that limit the water effect
float glowHeight;
float limitTop;
float limitTopWater;
float limitLeft;
float limitRight;
float limitBottom;

sampler TextureSampler : register(s0);      // Original color
sampler DisplacementSampler : register(s1); // Displacement

float fadeoutWidth = 0.05;

// External rumble displacement
int enableRumble;
float displacementX;
float displacementY;
float screenZoom;

float4 main(float4 color : COLOR0, float2 texCoord : TEXCOORD0) : COLOR0
{
    // Calculate minimal distance to the nearest border
    float dx = min(texCoord.x - limitLeft, limitRight - texCoord.x);
    float dy = min(texCoord.y - limitTop, limitBottom - texCoord.y);

    ///////////////////////////////////////////////////////////////////////////
    // RUMBLE
    ///////////////////////////////////////////////////////////////////////////
    if (enableRumble != 0)
    {
        // Limit rumble strength by distance to HLSL-active region (think map)
        // The factor of 100 is chosen by hand and controls the slope with which dimfactor goes to 1
        float dimfactor = clamp(100.0f * min(dx, dy), 0, 1); // Maximum is 1.0 (do not amplify)
        // Shift texture coordinates by rumble
        texCoord.x += displacementX * dimfactor * screenZoom;
        texCoord.y += displacementY * dimfactor * screenZoom;
    }

    ///////////////////////////////////////////////////////////////////////////
    // Water refraction (optical distortion) and water-like color tint
    ///////////////////////////////////////////////////////////////////////////
    if (dx >= 0)
    {
        float dyWater = min(texCoord.y - limitTopWater, limitBottom - texCoord.y);
        if (dyWater >= 0)
        {
            // Look up the amount of displacement from the texture
            float2 displacement = tex2D(DisplacementSampler, DisplacementScroll + texCoord / 3);
            float finalFactor = min(dx, dyWater) / fadeoutWidth;
            if (finalFactor > 1) finalFactor = 1;
            // Apply displacement by water refraction
            texCoord.x += (displacement.x * 0.2 - 0.15) * finalFactor * 0.15 * screenZoom; // Why these strange numbers?
            texCoord.y += (displacement.y * 0.2 - 0.15) * finalFactor * 0.15 * screenZoom;
            // Look up the texture color of the original underwater pixel.
            color = tex2D(TextureSampler, texCoord);
            // Additional color transformation (blue shift)
            color.r = color.r - 0.1f;
            color.g = color.g - 0.1f;
            color.b = color.b + 0.3f;
        }
        else if (dyWater > -glowHeight)
        {
            // No water distortion...
            color = tex2D(TextureSampler, texCoord);
            // Scales from 0 (upper glow limit) ... 1 (near water surface)
            float glowFactor = 1 - (dyWater / -glowHeight);
            // ... but bluish glow
            // Additional color transformation
            color.r = color.r - (glowFactor * 0.1); // 24 = 1/(30f/720f); // Prelim: depends on screen resolution, must fit to value in HLSL Update
            color.g = color.g - (glowFactor * 0.1);
            color.b = color.b + (glowFactor * 0.3);
        }
        else
        {
            // Return original color (no water distortion above and below)
            color = tex2D(TextureSampler, texCoord);
        }
    }
    else
    {
        // Return original color (no water distortion left or right)
        color = tex2D(TextureSampler, texCoord);
    }
    return color;
}

technique Refraction
{
    pass Pass0
    {
        PixelShader = compile ps_2_0 main();
    }
}
I'm not very familiar with HLSL internals, but what I've learned from GLSL is: avoid branching. The GPU will probably execute both branches and then decide which result is valid.
Also have a look at this
and this.
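As one concrete illustration of branch avoidance (a sketch of mine against the rumble block in the shader above, untested): turn the condition into a 0-or-1 factor and multiply by it.
// Sketch: pass the flag as a float that is 0.0 or 1.0 and scale by it,
// instead of branching on `if (enableRumble != 0)`.
// `rumbleEnabled` is a hypothetical float uniform replacing `enableRumble`.
float2 rumble = float2(displacementX, displacementY) * dimfactor * screenZoom;
texCoord += rumble * rumbleEnabled;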
As far as I know there are no automatic tools except the compiler itself. For very low-level optimization you can use fxc with the /Fc parameter to get the assembly listing. The possible assembly instructions are listed here. One low-level optimization worth mentioning is MAD: multiply and add. The following may not be optimized to a MAD operation (I'm not sure, just try it out yourself):
a *= b;
a += c;
but this should be optimized to a MAD:
a = (a * b) + c;
You can also optimize your code with mathematical manipulation. Take, for example:
// Shift texture coordinates by rumble
texCoord.x += displacementX * dimfactor * screenZoom;
texCoord.y += displacementY * dimfactor * screenZoom;
Here you multiply three values, but only one of them comes from a register of the GPU; the other two are constants, so you could pre-multiply them and store the result in a global constant.
// Shift texture coordinates by rumble
texCoord.x += dimfactor * pre_zoom_dispx; // displacementX * screenZoom
texCoord.y += dimfactor * pre_zoom_dispy; // displacementY * screenZoom
Another example:
// Apply displacement by water refraction
texCoord.x += (displacement.x * 0.2 - 0.15) * finalFactor * 0.15 * screenZoom; // Why these strange numbers ?
texCoord.y += (displacement.y * 0.2 - 0.15) * finalFactor * 0.15 * screenZoom;
Here, 0.15 * screenZoom can be precomputed into one global constant.
The HLSL compiler of Visual Studio 2012 has an option in its properties to enable optimizations. But the best optimization you can make is to write the HLSL code as simply as possible and to use the intrinsic functions: http://msdn.microsoft.com/en-us/library/windows/desktop/ff471376(v=vs.85).aspx
Those functions are like C's memcpy: their bodies use system resources such as 128-bit registers (yes, CPUs have 128-bit registers: http://en.wikipedia.org/wiki/Streaming_SIMD_Extensions) and extremely fast operations.
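Two small examples against the water shader above (sketches of mine; saturate exists in all shader models, while the mad intrinsic is available from Shader Model 5):
// clamp(x, 0, 1) -> saturate(x), which is typically free on GPU hardware
float dimfactor = saturate(100.0f * min(dx, dy));

// a * b + c -> mad(a, b, c), a single multiply-add instruction
float shiftedX = mad(displacement.x, 0.2f, -0.15f);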