The original engine is written for GL2+ class hardware, and it works and runs pretty well on that kind of hardware. But I figured that if I removed this limitation I could come up with quite a bit simpler code.
The GL2+ code basically makes individual textures out of the texture sheet. I basically emulate the texture sheet CPU side. But texturing on the Model 3 is weird. The way the database designer worked was a bit strange. A group of 4 textures might be addressed individually by a polygon, or a polygon might address a subset of the 4 textures. So for a single x/y position in the texture sheet you might have 5 different textures living there: just imagine 4 textures grouped together, and all the combinations of them. If it were 9 textures, the number of possible combinations gets a lot larger.
Then there's the fact that each address is a 16-bit value. We can either have one 16-bit texture live there, or potentially have four 4-bit textures live there, at the same x/y position. This comes on top of the scenario above.
The bottom line is this makes the texture binding and invalidation code pretty complex and potentially error prone.
With GL4+ h/w we can do unsigned integer maths. We can also use 16 bit unsigned textures. This means we can actually pass the model3 memory directly to opengl unmodified. Then we can do the texturing in the shader.
To create a 16bit unsigned texture we use this
- Code: Select all
// Create the texture sheet: 2048x2048 texels, one 16-bit unsigned integer
// channel per texel (GL_R16UI). No initial data is supplied (nullptr); the
// contents are uploaded separately.
glBindTexture(GL_TEXTURE_2D, m_textureBuffer);

// Sampling state: nearest-neighbour for both minification and magnification.
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

// Wrap state: repeat on both axes.
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);

// Allocate level 0. Integer internal formats must be paired with an
// *_INTEGER pixel format, hence GL_RED_INTEGER / GL_UNSIGNED_SHORT.
glTexImage2D(GL_TEXTURE_2D, 0, GL_R16UI, 2048, 2048, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, nullptr);
Then to update the texture memory
- Code: Select all
// Uploads a rectangular region of m_textureRAM into the texture bound to
// m_textureBuffer, one row at a time.
//
//   level         - not referenced by any of the statements shown here
//   x, y          - offset of the region, used both as the destination offset
//                   in the texture and as the source offset in m_textureRAM
//   width, height - size of the region in texels
void CNew3D::UploadTextures(unsigned level, unsigned x, unsigned y, unsigned width, unsigned height)
{
glBindTexture(GL_TEXTURE_2D, m_textureBuffer);
// Source rows start at arbitrary x offsets, so only 2-byte row alignment is
// requested from GL when unpacking.
glPixelStorei(GL_UNPACK_ALIGNMENT, 2);
// Each iteration uploads a single row (height argument is 1). The source
// pointer advances 2048 elements per row, i.e. the source is addressed as a
// 2048-element-wide sheet regardless of the width being uploaded.
// NOTE(review): presumably m_textureRAM points at 16-bit elements, matching
// GL_UNSIGNED_SHORT - confirm against its declaration.
for (unsigned i = 0; i < height; i++) {
glTexSubImage2D(GL_TEXTURE_2D, 0, x, y + i, width, 1, GL_RED_INTEGER, GL_UNSIGNED_SHORT, m_textureRAM + ((y + i) * 2048) + x);
}
Then in the shader.
- Code: Select all
// Fetches and decodes the 2x2 block of texels around texCoord from the
// unsigned-integer texture sheet, as the inputs for a manual bilinear filter.
// NOTE(review): only the part of the function up to the four texel fetches is
// shown here; the blend of the four colours and the return are not visible.
vec4 texBiLinear(usampler2D texSampler, float level, ivec2 wrapMode, vec2 texSize, ivec2 texPos, vec2 texCoord)
{
float tx[2], ty[2];
// One call per axis. Each call receives two elements of tx / ty (presumably
// written through out parameters with the two neighbouring sample
// coordinates) and returns a float (presumably the blend weight between
// them - confirm against LinearTexLocations).
float a = LinearTexLocations(wrapMode.s, texSize.x, texCoord.x, tx[0], tx[1]);
float b = LinearTexLocations(wrapMode.t, texSize.y, texCoord.y, ty[0], ty[1]);
// Read the four raw 16-bit words with texelFetch (unfiltered, mip level 0)
// and decode each with ExtractColour using the format in baseTexType.
// Naming: pNqM uses tx[N] and ty[M].
// NOTE(review): the coordinates are scaled by baseTexInfo.zw rather than the
// texSize parameter, and the level parameter is not used in the lines shown.
vec4 p0q0 = ExtractColour(baseTexType,texelFetch(texSampler, ivec2(vec2(tx[0],ty[0]) * vec2(baseTexInfo.zw) + texPos), 0).r);
vec4 p1q0 = ExtractColour(baseTexType,texelFetch(texSampler, ivec2(vec2(tx[1],ty[0]) * vec2(baseTexInfo.zw) + texPos), 0).r);
vec4 p0q1 = ExtractColour(baseTexType,texelFetch(texSampler, ivec2(vec2(tx[0],ty[1]) * vec2(baseTexInfo.zw) + texPos), 0).r);
vec4 p1q1 = ExtractColour(baseTexType,texelFetch(texSampler, ivec2(vec2(tx[1],ty[1]) * vec2(baseTexInfo.zw) + texPos), 0).r);
- Code: Select all
// Builds the colour for a single-channel "keyed" texel: the intensity
// field / maxValue is replicated to r, g and b, and the texel is fully
// transparent only when the field holds its maximum value, opaque otherwise.
vec4 ExtractKeyedLuminance(uint field, uint maxValue)
{
    vec4 c = vec4(float(field) / float(maxValue));
    // Test the raw integer rather than the divided float, so the transparency
    // key does not depend on the division producing exactly 1.0.
    c.a = (field == maxValue) ? 0.0 : 1.0;
    return c;
}

// Decodes one raw 16-bit word read from the texture sheet into an RGBA colour
// with every channel in the range 0..1.
//
//   type  - texel format selector, 0..11 (see the branch comments)
//   value - the 16-bit word as fetched from the unsigned-integer texture
//
// Returns the decoded colour. An unrecognised type yields vec4(0.0).
//
// All mask literals carry the 'u' suffix so both operands of '&' are unsigned.
vec4 ExtractColour(int type, uint value)
{
    vec4 c = vec4(0.0);

    if (type == 0) {            // T1RGB5
        c.r = float((value >> 10) & 0x1Fu) / 31.0;
        c.g = float((value >> 5 ) & 0x1Fu) / 31.0;
        c.b = float((value      ) & 0x1Fu) / 31.0;
        c.a = 1.0 - float((value >> 15) & 0x1u);    // top bit set => transparent
    }
    // Types 1-4 hold two 4-bit fields in one byte. Each field must be masked
    // with 0xF: a wider mask lets the neighbouring field leak in and pushes
    // the result far above 1.0.
    else if (type == 1) {       // Interleaved A4L4 (low byte): L = bits 0-3, A = bits 4-7
        c.rgb = vec3(float(value & 0xFu) / 15.0);
        c.a   = float((value >> 4) & 0xFu) / 15.0;
    }
    else if (type == 2) {       // low byte: A = bits 0-3, L = bits 4-7
        c.a   = float(value & 0xFu) / 15.0;
        c.rgb = vec3(float((value >> 4) & 0xFu) / 15.0);
    }
    else if (type == 3) {       // high byte: L = bits 8-11, A = bits 12-15
        c.rgb = vec3(float((value >> 8) & 0xFu) / 15.0);
        c.a   = float((value >> 12) & 0xFu) / 15.0;
    }
    else if (type == 4) {       // high byte: A = bits 8-11, L = bits 12-15
        c.a   = float((value >> 8) & 0xFu) / 15.0;
        c.rgb = vec3(float((value >> 12) & 0xFu) / 15.0);
    }
    else if (type == 5) {       // 8-bit, low byte; 0xFF is transparent
        c = ExtractKeyedLuminance(value & 0xFFu, 0xFFu);
    }
    else if (type == 6) {       // 8-bit, high byte; 0xFF is transparent
        c = ExtractKeyedLuminance((value >> 8) & 0xFFu, 0xFFu);
    }
    else if (type == 7) {       // RGBA4
        c.r = float((value >> 12) & 0xFu) / 15.0;
        c.g = float((value >>  8) & 0xFu) / 15.0;
        c.b = float((value >>  4) & 0xFu) / 15.0;
        c.a = float((value >>  0) & 0xFu) / 15.0;
    }
    else if (type == 8) {       // low byte, low nibble; 0xF is transparent
        c = ExtractKeyedLuminance(value & 0xFu, 0xFu);
    }
    else if (type == 9) {       // low byte, high nibble; 0xF is transparent
        c = ExtractKeyedLuminance((value >> 4) & 0xFu, 0xFu);
    }
    else if (type == 10) {      // high byte, low nibble; 0xF is transparent
        c = ExtractKeyedLuminance((value >> 8) & 0xFu, 0xFu);
    }
    else if (type == 11) {      // high byte, high nibble; 0xF is transparent
        c = ExtractKeyedLuminance((value >> 12) & 0xFu, 0xFu);
    }

    return c;
}
// Converts a rectangle given in level-0 texels into the rectangle occupied by
// the requested mip level.
//
//   level                - mip level; used as the index into both base tables
//                          and as the power-of-two shrink factor
//   x, y, width, height  - in: the level-0 rectangle
//                          out: the rectangle for the requested level
//
// Every value is divided by 2^level (integer division), and the position is
// then offset by that level's base from the tables below.
// NOTE(review): mipXBase has 12 entries but mipYBase only 11, so level 11 is
// in range for x but not for y.
void GetPosition(int level, inout int x, inout int y, inout int width, inout int height)
{
    const int mipXBase[] = { 0, 1024, 1536, 1792, 1920, 1984, 2016, 2032, 2040, 2044, 2046, 2047 };
    const int mipYBase[] = { 0, 512, 768, 896, 960, 992, 1008, 1016, 1020, 1022, 1023 };

    int shrink = 1 << level;

    // Size first, then position; the four updates are independent.
    width  = width  / shrink;
    height = height / shrink;

    x = (x / shrink) + mipXBase[level];
    y = (y / shrink) + mipYBase[level];
}
// Looks up the x/y position associated with a microtexture id.
//
//   id   - table index, 0..7
//   x, y - receive the position stored for that id
//
// The eight entries form a grid two columns wide (x = 0 or 128) and four rows
// tall (y = 0, 128, 256 or 384).
void GetMicroTexturePos(int id, out int x, out int y)
{
    const ivec2 origins[8] = ivec2[8](
        ivec2(  0,   0), ivec2(  0, 128),
        ivec2(128,   0), ivec2(128, 128),
        ivec2(  0, 256), ivec2(  0, 384),
        ivec2(128, 256), ivec2(128, 384));

    ivec2 origin = origins[id];
    x = origin.x;
    y = origin.y;
}
// Returns which 1024-row page the given row falls in (integer division of
// yCoord by 1024).
int GetPage(int yCoord)
{
    const int rowsPerPage = 1024;
    int page = yCoord / rowsPerPage;
    return page;
}
// Offsets a row coordinate by a whole number of 1024-row pages and returns
// the resulting row.
int AddPage(int yCoord, int page)
{
    const int rowsPerPage = 1024;
    int pageOffset = page * rowsPerPage;
    return pageOffset + yCoord;
}
The code isn't quite finished. I haven't plugged in mipmapping, microtextures, or texture offsets yet, but basically it works.
So what are the pros and cons?
Well the cons are, we need gl 4+ hardware to make this work which increases the system requirements.
The shader is LARGE. Actually so large I broke the string literal size limit in visual studio which is 65k bytes. I had to split the fragment shader into two.
The shader size and added complexity might be problematic for older/lower end h/w.
To make this work on mac we need a core context. This might not be too hard to do, and we could still use the legacy renderer with a non core context. Enabled with a switch perhaps.
Pros
Much simpler code. Assuming a reasonable graphics card, it is also lighter on the CPU side.
More efficient memory footprint. We'll only ever use 8 MB of RAM, since we use a direct copy of the Real3D memory.
Weird corner cases, such as an x/y offset being set and simply casting the memory as another type, will work as expected, as will texturing in illegal locations. E.g. Fighting Vipers has some illegal textures which are written too far into the memory sheet, if I recall.
All texture binds are gone, and replaced with just a uniform with the x/y location of the texture in the memory. This should theoretically be faster.