263 lines
9.9 KiB
Plaintext
263 lines
9.9 KiB
Plaintext
|
/*
|
||
|
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
|
||
|
*
|
||
|
* NOTICE TO USER:
|
||
|
*
|
||
|
* This source code is subject to NVIDIA ownership rights under U.S. and
|
||
|
* international Copyright laws. Users and possessors of this source code
|
||
|
* are hereby granted a nonexclusive, royalty-free license to use this code
|
||
|
* in individual and commercial software.
|
||
|
*
|
||
|
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
|
||
|
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
|
||
|
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
|
||
|
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
|
||
|
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||
|
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
|
||
|
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
||
|
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
||
|
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
||
|
* OR PERFORMANCE OF THIS SOURCE CODE.
|
||
|
*
|
||
|
* U.S. Government End Users. This source code is a "commercial item" as
|
||
|
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
|
||
|
* "commercial computer software" and "commercial computer software
|
||
|
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
|
||
|
* and is provided to the U.S. Government only as a commercial end item.
|
||
|
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
|
||
|
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
|
||
|
* source code with only those rights set forth herein.
|
||
|
*
|
||
|
* Any use of this source code in individual and commercial software must
|
||
|
* include, in the user documentation and internal comments to the code,
|
||
|
* the above Disclaimer and U.S. Government End Users Notice.
|
||
|
*/
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include "fluidsGL_kernels.h"
|
||
|
|
||
|
// Texture reference for reading velocity field
|
||
|
texture<float2, 2> texref;
|
||
|
static cudaArray *array = NULL;
|
||
|
|
||
|
void setupTexture(int x, int y) {
|
||
|
// Wrap mode appears to be the new default
|
||
|
texref.filterMode = cudaFilterModeLinear;
|
||
|
cudaChannelFormatDesc desc = cudaCreateChannelDesc<float2>();
|
||
|
|
||
|
cudaMallocArray(&array, &desc, y, x);
|
||
|
cutilCheckMsg("cudaMalloc failed");
|
||
|
}
|
||
|
|
||
|
void bindTexture(void) {
|
||
|
cudaBindTextureToArray(texref, array);
|
||
|
cutilCheckMsg("cudaBindTexture failed");
|
||
|
}
|
||
|
|
||
|
void unbindTexture(void) {
|
||
|
cudaUnbindTexture(texref);
|
||
|
cutilCheckMsg("cudaUnbindTexture failed");
|
||
|
}
|
||
|
|
||
|
void updateTexture(cData *data, size_t wib, size_t h, size_t pitch) {
|
||
|
cudaMemcpy2DToArray(array, 0, 0, data, pitch, wib, h, cudaMemcpyDeviceToDevice);
|
||
|
cutilCheckMsg("cudaMemcpy failed");
|
||
|
}
|
||
|
|
||
|
void deleteTexture(void) {
|
||
|
cudaFreeArray(array);
|
||
|
cutilCheckMsg("cudaFreeArray failed");
|
||
|
}
|
||
|
|
||
|
// Note that these kernels are designed to work with arbitrary
|
||
|
// domain sizes, not just domains that are multiples of the tile
|
||
|
// size. Therefore, we have extra code that checks to make sure
|
||
|
// a given thread location falls within the domain boundaries in
|
||
|
// both X and Y. Also, the domain is covered by looping over
|
||
|
// multiple elements in the Y direction, while there is a one-to-one
|
||
|
// mapping between threads in X and the tile size in X.
|
||
|
// Nolan Goodnight 9/22/06
|
||
|
|
||
|
// This method adds constant force vectors to the velocity field
|
||
|
// stored in 'v' according to v(x,t+1) = v(x,t) + dt * f.
|
||
|
__global__ void
|
||
|
addForces_k(cData *v, int dx, int dy, int spx, int spy, float fx, float fy, int r, size_t pitch) {
|
||
|
|
||
|
int tx = threadIdx.x;
|
||
|
int ty = threadIdx.y;
|
||
|
cData *fj = (cData*)((char*)v + (ty + spy) * pitch) + tx + spx;
|
||
|
|
||
|
cData vterm = *fj;
|
||
|
tx -= r; ty -= r;
|
||
|
float s = 1.f / (1.f + tx*tx*tx*tx + ty*ty*ty*ty);
|
||
|
vterm.x += s * fx;
|
||
|
vterm.y += s * fy;
|
||
|
*fj = vterm;
|
||
|
}
|
||
|
|
||
|
// This method performs the velocity advection step, where we
|
||
|
// trace velocity vectors back in time to update each grid cell.
|
||
|
// That is, v(x,t+1) = v(p(x,-dt),t). Here we perform bilinear
|
||
|
// interpolation in the velocity space.
|
||
|
__global__ void
|
||
|
advectVelocity_k(cData *v, float *vx, float *vy,
|
||
|
int dx, int pdx, int dy, float dt, int lb) {
|
||
|
|
||
|
int gtidx = blockIdx.x * blockDim.x + threadIdx.x;
|
||
|
int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb;
|
||
|
int p;
|
||
|
|
||
|
cData vterm, ploc;
|
||
|
float vxterm, vyterm;
|
||
|
// gtidx is the domain location in x for this thread
|
||
|
if (gtidx < dx) {
|
||
|
for (p = 0; p < lb; p++) {
|
||
|
// fi is the domain location in y for this thread
|
||
|
int fi = gtidy + p;
|
||
|
if (fi < dy) {
|
||
|
int fj = fi * pdx + gtidx;
|
||
|
vterm = tex2D(texref, (float)gtidx, (float)fi);
|
||
|
ploc.x = (gtidx + 0.5f) - (dt * vterm.x * dx);
|
||
|
ploc.y = (fi + 0.5f) - (dt * vterm.y * dy);
|
||
|
vterm = tex2D(texref, ploc.x, ploc.y);
|
||
|
vxterm = vterm.x; vyterm = vterm.y;
|
||
|
vx[fj] = vxterm;
|
||
|
vy[fj] = vyterm;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// This method performs velocity diffusion and forces mass conservation
|
||
|
// in the frequency domain. The inputs 'vx' and 'vy' are complex-valued
|
||
|
// arrays holding the Fourier coefficients of the velocity field in
|
||
|
// X and Y. Diffusion in this space takes a simple form described as:
|
||
|
// v(k,t) = v(k,t) / (1 + visc * dt * k^2), where visc is the viscosity,
|
||
|
// and k is the wavenumber. The projection step forces the Fourier
|
||
|
// velocity vectors to be orthogonal to the vectors for each
|
||
|
// wavenumber: v(k,t) = v(k,t) - ((k dot v(k,t) * k) / k^2.
|
||
|
__global__ void
|
||
|
diffuseProject_k(cData *vx, cData *vy, int dx, int dy, float dt,
|
||
|
float visc, int lb) {
|
||
|
|
||
|
int gtidx = blockIdx.x * blockDim.x + threadIdx.x;
|
||
|
int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb;
|
||
|
int p;
|
||
|
|
||
|
cData xterm, yterm;
|
||
|
// gtidx is the domain location in x for this thread
|
||
|
if (gtidx < dx) {
|
||
|
for (p = 0; p < lb; p++) {
|
||
|
// fi is the domain location in y for this thread
|
||
|
int fi = gtidy + p;
|
||
|
if (fi < dy) {
|
||
|
int fj = fi * dx + gtidx;
|
||
|
xterm = vx[fj];
|
||
|
yterm = vy[fj];
|
||
|
|
||
|
// Compute the index of the wavenumber based on the
|
||
|
// data order produced by a standard NN FFT.
|
||
|
int iix = gtidx;
|
||
|
int iiy = (fi>dy/2)?(fi-(dy)):fi;
|
||
|
|
||
|
// Velocity diffusion
|
||
|
float kk = (float)(iix * iix + iiy * iiy); // k^2
|
||
|
float diff = 1.f / (1.f + visc * dt * kk);
|
||
|
xterm.x *= diff; xterm.y *= diff;
|
||
|
yterm.x *= diff; yterm.y *= diff;
|
||
|
|
||
|
// Velocity projection
|
||
|
if (kk > 0.f) {
|
||
|
float rkk = 1.f / kk;
|
||
|
// Real portion of velocity projection
|
||
|
float rkp = (iix * xterm.x + iiy * yterm.x);
|
||
|
// Imaginary portion of velocity projection
|
||
|
float ikp = (iix * xterm.y + iiy * yterm.y);
|
||
|
xterm.x -= rkk * rkp * iix;
|
||
|
xterm.y -= rkk * ikp * iix;
|
||
|
yterm.x -= rkk * rkp * iiy;
|
||
|
yterm.y -= rkk * ikp * iiy;
|
||
|
}
|
||
|
|
||
|
vx[fj] = xterm;
|
||
|
vy[fj] = yterm;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// This method updates the velocity field 'v' using the two complex
|
||
|
// arrays from the previous step: 'vx' and 'vy'. Here we scale the
|
||
|
// real components by 1/(dx*dy) to account for an unnormalized FFT.
|
||
|
__global__ void
|
||
|
updateVelocity_k(cData *v, float *vx, float *vy,
|
||
|
int dx, int pdx, int dy, int lb, size_t pitch) {
|
||
|
|
||
|
int gtidx = blockIdx.x * blockDim.x + threadIdx.x;
|
||
|
int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb;
|
||
|
int p;
|
||
|
|
||
|
float vxterm, vyterm;
|
||
|
cData nvterm;
|
||
|
// gtidx is the domain location in x for this thread
|
||
|
if (gtidx < dx) {
|
||
|
for (p = 0; p < lb; p++) {
|
||
|
// fi is the domain location in y for this thread
|
||
|
int fi = gtidy + p;
|
||
|
if (fi < dy) {
|
||
|
int fjr = fi * pdx + gtidx;
|
||
|
vxterm = vx[fjr];
|
||
|
vyterm = vy[fjr];
|
||
|
|
||
|
// Normalize the result of the inverse FFT
|
||
|
float scale = 1.f / (dx * dy);
|
||
|
nvterm.x = vxterm * scale;
|
||
|
nvterm.y = vyterm * scale;
|
||
|
|
||
|
cData *fj = (cData*)((char*)v + fi * pitch) + gtidx;
|
||
|
*fj = nvterm;
|
||
|
}
|
||
|
} // If this thread is inside the domain in Y
|
||
|
} // If this thread is inside the domain in X
|
||
|
}
|
||
|
|
||
|
// This method updates the particles by moving particle positions
|
||
|
// according to the velocity field and time step. That is, for each
|
||
|
// particle: p(t+1) = p(t) + dt * v(p(t)).
|
||
|
__global__ void
|
||
|
advectParticles_k(cData *part, cData *v, int dx, int dy,
|
||
|
float dt, int lb, size_t pitch) {
|
||
|
|
||
|
int gtidx = blockIdx.x * blockDim.x + threadIdx.x;
|
||
|
int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb;
|
||
|
int p;
|
||
|
|
||
|
// gtidx is the domain location in x for this thread
|
||
|
cData pterm, vterm;
|
||
|
if (gtidx < dx) {
|
||
|
for (p = 0; p < lb; p++) {
|
||
|
// fi is the domain location in y for this thread
|
||
|
int fi = gtidy + p;
|
||
|
if (fi < dy) {
|
||
|
int fj = fi * dx + gtidx;
|
||
|
pterm = part[fj];
|
||
|
|
||
|
int xvi = ((int)(pterm.x * dx));
|
||
|
int yvi = ((int)(pterm.y * dy));
|
||
|
vterm = *((cData*)((char*)v + yvi * pitch) + xvi);
|
||
|
|
||
|
pterm.x += dt * vterm.x;
|
||
|
pterm.x = pterm.x - (int)pterm.x;
|
||
|
pterm.x += 1.f;
|
||
|
pterm.x = pterm.x - (int)pterm.x;
|
||
|
pterm.y += dt * vterm.y;
|
||
|
pterm.y = pterm.y - (int)pterm.y;
|
||
|
pterm.y += 1.f;
|
||
|
pterm.y = pterm.y - (int)pterm.y;
|
||
|
|
||
|
part[fj] = pterm;
|
||
|
}
|
||
|
} // If this thread is inside the domain in Y
|
||
|
} // If this thread is inside the domain in X
|
||
|
}
|