5/31/2008

API Hooking on Windows Platform

DLL hooking is a useful technique for debugging, testing and hacking. In the following code, I will show how to hook a system API (GetCurrentProcessId()) that is called from a 3rd-party DLL (MyNetwork.dll).

The import section of a DLL is conceptually organized as:
  1 ///////////////////////////////////////////////////////////////////////////////
  2 // hooker.h
  3 ///////////////////////////////////////////////////////////////////////////////
  4 #include <windows.h>
  5
  6 class GetPidHooker
  7 {
  8 public:
  9    GetPidHooker(int nDiff);
 10    ~GetPidHooker();
 11
 12 private:
 13    static DWORD WINAPI HookedGetPid(void);
 14    void   HookIt(FARPROC pfnOld, FARPROC pfnNew);
 15
 16    typedef DWORD (*GetPIDFuncType)(void);
 17
 18 private:
 19    static int m_nDiff;
 20    static FARPROC m_pfnOrgMethod;
 21 };
 22
 23
 24 ///////////////////////////////////////////////////////////////////////////////
 25 // hooker.cpp
 26 ///////////////////////////////////////////////////////////////////////////////
 27 #include <windows.h>
 28 #include <dbghelp.h>
 29 #include <stdio.h>
 30 #include "hooker.h"
 31
 32 int GetPidHooker::m_nDiff = 0;                 // offset that HookedGetPid() adds to the real pid
 33 FARPROC GetPidHooker::m_pfnOrgMethod = NULL;   // address of the real GetCurrentProcessId(); set by the constructor
 34
 35 GetPidHooker::GetPidHooker(int nDiff)
 36 {
 37    m_nDiff = nDiff;
 38
 39    // GetCurrentProcessId() is in kernel32.dll
 40    HMODULE hOldModule = GetModuleHandle(L"kernel32.dll");
 41    if (NULL == hOldModule)
 42    {
 43        printf("GetPidHooker:: Can't find MODULE - kernel32.dll!\n");
 44        return;
 45    }
 46    m_pfnOrgMethod = GetProcAddress(hOldModule, "GetCurrentProcessId");
 47    if (NULL == m_pfnOrgMethod)
 48    {
 49        printf("GetPidHooker:: Can't locate address of function - GetCurrentProcessID!\n");
 50        return;
 51    }
 52
 53    // Do the hooking
 54    FARPROC pfnNew = (FARPROC)HookedGetPid;
 55    HookIt(m_pfnOrgMethod, pfnNew);
 56 }
 57
 58 void GetPidHooker::HookIt(FARPROC pfnOld, FARPROC pfnNew)
 59 {
 60    // get the first Import Descriptor in Import Directory Table of MyNetwork.dll
 61    ULONG ulSize = 0;
 62    HMODULE hCallerModule = GetModuleHandle(L"MyNetwork.dll");
 63    if (NULL == hCallerModule)
 64    {
 65            printf("GetPidHooker:: can't find MODULE - MyNetwork.dll\n");
 66            return;
 67    }
 68    PIMAGE_IMPORT_DESCRIPTOR pImportDesc =
 69        (PIMAGE_IMPORT_DESCRIPTOR)ImageDirectoryEntryToData(
 70            hCallerModule,
 71            TRUE,
 72            IMAGE_DIRECTORY_ENTRY_IMPORT,
 73            &ulSize);
 74    if (NULL == pImportDesc)
 75    {
 76        printf("GetPidHooker:: Can't locate Import section of MyNetwork.dll!\n");
 77        return;
 78    }
 79
 80    // find KERNEL32.dll's Import Descriptor
 81    for (; pImportDesc->Name; pImportDesc++)
 82    {
 83        PSTR pszModName = (PSTR)((PBYTE)hCallerModule + pImportDesc->Name);
 84        //NOTE: PE format only use ascii chars
 85        if (0 == lstrcmpiA(pszModName, "kernel32.dll"))
 86        {
 87            break;
 88        }
 89    }
 90    if (0 == pImportDesc->Name)
 91    {
 92        printf("GetPidHooker:: Can't locate KERNEL32.dll in MyNetwork.dll's import section!\n");
 93        return;
 94    }
 95
 96    // loop to find the thunk for GetCurrentProcessId() in kernel32.dll's IAT
 97    PIMAGE_THUNK_DATA pThunk = (PIMAGE_THUNK_DATA)((PBYTE)hCallerModule + pImportDesc->FirstThunk);
 98    for (; pThunk->u1.Function; pThunk++)
 99    {
100        // Is this the function we're looking for?
101        if (*(FARPROC*)(&pThunk->u1.Function) == pfnOld)
102        {
103            // hook it - write new address, the old address has been saved elsewhere
104            WriteProcessMemory(
105                GetCurrentProcess(),
106                (LPVOID)(&pThunk->u1.Function),
107                &pfnNew,
108                sizeof(pfnNew),
109                NULL);
110            break;
111        }
112
113    }
114    if (0 == pThunk->u1.Function)
115    {
116        printf("GetPidHooker:: Can't locate specified address in MyNetwork.dll's import section!\n");
117    }
118 }
119
120 GetPidHooker::~GetPidHooker()
121 {
122    // The original address was never resolved, so there is nothing to restore.
123    if (NULL == m_pfnOrgMethod)
124    {
125        return;
126    }
127    // Undo the hook: look for HookedGetPid in the IAT and write the real address back.
128    HookIt((FARPROC)HookedGetPid, m_pfnOrgMethod);
129 }
130
131 DWORD GetPidHooker::HookedGetPid(void)  // Runs in place of the hooked import
132 {
133    return ((GetPIDFuncType)m_pfnOrgMethod)() + m_nDiff;  // real pid, shifted by the configured offset
134 }
135
136 ///////////////////////////////////////////////////////////////////////////////
137 //  main.cpp
138 ///////////////////////////////////////////////////////////////////////////////
139 #include <windows.h>
140 #include <stdio.h>
141 #include "hooker.h"
142 #include "MyNetwork.h"
143
144 int __cdecl main(int argc, const char** argv)  // Prints the pid seen through MyNetwork.dll before, during and after hooking
145 {
146     printf("Before Hook - current pid is %d\n", MyNetwork::GetCurPID());
147
148     {   // the hook lives exactly as long as this scope
149         GetPidHooker scopedHook(5);
150         printf("In Hook - current pid is %d\n", MyNetwork::GetCurPID());
151     }   // scopedHook's destructor has run here
152
153     printf("After Hook - current pid is %d\n", MyNetwork::GetCurPID());
154 }
155

NOTE:
1. Add dbghelp.lib as an additional library dependency of your project.
2. Make sure that you are referencing MyNetwork.Dll and MyNetwork.dll calls GetCurrentProcessId() in its implementation.
3. Memory pages that hold DLL data are copy-on-write, so patching them is safe even when multiple processes share the same DLL. However, my code here is not thread-safe within one process; you can use any thread synchronization mechanism to ensure thread safety.

full source code package:
http://code4cs.googlecode.com/files/DllHookWin.zip

5/25/2008

Capture Console App Output Using Pipe

  The pipe is an important and useful IPC mechanism on the Windows platform. Capturing console application output is often required for debuggers or multi-process applications. The basic output-capturing task leverages the stdio redirection provided by the OS. Files can be used to redirect stdout/stdin, but a pipe is a more elegant way to do so: it needs no extra file creation and deletion, the write and read ends can work in a pipeline (consumer/producer) style, and the reader can start working before the writer has finished all of its tasks.

  Here is the code that combines an anonymous pipe and stdio redirection to capture console application output:
 1 #include <string>
 2 #include <sstream>
 3 #include <windows.h>
 4 #include <vector>
 5
 6 int __cdecl main(int argc, const char** argv)
 7 {
 8     (void)argc;
 9     (void)argv;
10     // connect pipe
11     SECURITY_ATTRIBUTES secAttr = {0};
12     secAttr.nLength = sizeof(SECURITY_ATTRIBUTES);
13     secAttr.lpSecurityDescriptor = 0;
14     secAttr.bInheritHandle = TRUE;
15     HANDLE hPipeRead = NULL, hPipeWrite = NULL;
16     if (!CreatePipe(&hPipeRead, &hPipeWrite, &secAttr, 0))
17     {
18         printf("CreatePipe Failed Due to Win32 Error:%d", GetLastError());
19         return -1;
20     }
21
22     // create dir process
23     const char * cmdLine = "cmd.exe /C dir /B /A:-D /O:D /T:W ";
24     STARTUPINFOA startInfo = {0};
25     startInfo.cb = sizeof(STARTUPINFOA);
26     startInfo.hStdOutput = hPipeWrite;
27     startInfo.dwFlags |= STARTF_USESTDHANDLES;
28     PROCESS_INFORMATION psInfo = {0};
29     if (!CreateProcessA(NULL, (LPSTR)cmdLine, NULL, NULL, TRUE, NULL, NULL, NULL, &startInfo, &psInfo))
30     {
31         printf("Create Dir Process Failed Due To Win32 Error:%d", GetLastError());
32         CloseHandle(hPipeRead);
33         CloseHandle(hPipeWrite);
34         return -1;
35     }
36
37     // read dir process output
38     DWORD dwRead = 0;
39     char dataBuf[MAX_PATH];
40     std::string outString;
41     outString.reserve(32 * 1024);
42     CloseHandle(hPipeWrite);
43     do
44     {
45         if (!ReadFile(hPipeRead, dataBuf, MAX_PATH, &dwRead, NULL))
46         {
47             if (GetLastError() == ERROR_BROKEN_PIPE)
48             // No more data
49             {
50                 break;
51             }
52             else
53             // Unkown error
54             {
55                 printf("Read From Pipe Failed Due to Win32 Error:%d", GetLastError());
56                 CloseHandle(hPipeRead);
57                 return -1;
58             }
59         }
60         if (dwRead != 0)
61         {
62             outString.append(dataBuf, dwRead);
63         }
64     } while (dwRead != 0);
65     CloseHandle(hPipeRead);
66
67     // parse output
68     std::stringstream outStream(outString);
69     std::string oneLine;
70     std::vector<std::string> fileVec;
71     oneLine.clear();
72     while (std::getline(outStream, oneLine))
73     {
74         if (!oneLine.empty())
75         {
76             // remove tailing nl/cr char
77             fileVec.push_back(oneLine.substr(0, oneLine.length() -1));
78         }
79         oneLine.clear();
80     }
81     return 0;
82 }


  NOTE on anonymous pipe:
1. The parent must use some mechanism (stdio redirection as above, or another IPC mechanism) to pass the pipe handle (read/write) to the child process.
2. The handles returned for an anonymous pipe are inheritable only when the bInheritHandle field of the secAttr structure is set to TRUE.
3. For the child process to be able to use the handles passed by the parent, the parent must set the 5th parameter (bInheritHandles) to TRUE when creating the child process.
4. An anonymous pipe exists as long as some read/write handle is connected to it.
5. If all write handles to an anonymous pipe are closed, ReadFile() operations on that pipe fail with ERROR_BROKEN_PIPE once all data has been read out.

5/14/2008

on the I/O Completion Port Mechanism

Microsoft provides a great OS infrastructure called I/O Completion Port to help developer building high performance server applications. In this article I will try to explain how it works and how to use it.

The main motivation for iocp is to improve i/o performance and the basic ideas behind the design of iocp are:
- async i/o results are posted to a kernel queue
- worker thread fetch and process one result at a time
- leverage resources as much as possible but also avoid context switching as much as possible

Before going further, let's clarify some concepts behind iocp:
1. async i/o - (multiple) i/o requests are posted to device driver asynchronously
2. consumer/producer model - most async models are typical consumer/producer models: the client thread produces i/o request packets and the device driver consumes them; the device driver produces i/o result packets and the worker threads consume them
3. queue - a queue is a great facility for thread cooperation. two threads that work asynchronously can use a queue to implement a consumer/producer model

So what's iocp and what can it provide?
1. you can regard iocp as a Queue kernel object, with some auxiliary data structures and routines, that is designed for async i/o result processing
2. if you associate an iocp with i/o handle (file/network/namedpipe/mailslot), all subsequent i/o requests' results will be queued to this iocp
3. these queued async i/o results can be fetched from iocp (in so called "worker thread")
4. you can post a completion status (a faked i/o result) to iocp directly. This can be used to implement an application-level consumer/producer cooperation model
5. iocp can control max # of active associated worker threads to reduce context switching overhead

                 (async i/o request)
client thread ------------------------> kernel device driver
      |                                          |
      | (direct completion status)               | (i/o completion status)
      |                                          V
      +------------------------------> iocp [queue embedded]
                                                 |
                                                 | (completion status)
                                                 V
                                          worker threads
     
        [iocp based application architecture]

How the whole stuff works?
1. you issue async i/o to device driver
2. device driver complete i/o request and post the result to the queue in iocp
3. when an i/o result arrives, iocp checks the number of currently active (ready state) worker threads associated with it; if that number is below the max value specified when the iocp was created, and some worker thread is waiting on the queue, one of them (in LIFO manner) is woken up to continue execution
4. a worker thread queries iocp when it finishes its current task (or right after creation); if the queue is empty or the number of active worker threads is above the max setting, it waits on the queue until iocp wakes it up as described in step 3
5. a thread is said to be "associated" with an iocp once it calls the GetQueuedCompletionStatus() API on it
6. the association between iocp and thread ends when:
- the thread terminates
- the iocp is closed
- the thread associates itself with another iocp

How to make use of this great mechanism?
1. create iocp
2. associate it with i/o device handles
3. create worker thread for this iocp
4. issue async i/o requests on those i/o device handles

Let's see a concrete example that leverages iocp to improve performance.
Problem & Requirement
1. You want your computer to do some sum computing works(each one is small, but huge amount of works)
2. You want each core of your processor to do some works

Solution
1. Build some kind of worker threads that can do sum computing
2. Worker threads get work items from Iocp, client send computing requests to Iocp

main.cpp - client of the Iocp Server
 1 #include <windows.h>
 2 #include <stdio.h>
 3 #include "iocpcomm.h"
 4 #include "iocpserver.h"
 5
 6 volatile LONGLONG   g_sum = 0;      // running total; WorkItemProccessor adds to it with InterlockedExchangeAdd64
 7 volatile LONG g_curWorkCount = 0;   // work items not yet finished; main() polls it until it reaches 0
 8
 9 const int WORK_COUNT = 100;         // number of work items main() posts
10
11 struct WORK_ITEM : OVERLAPPED   // derives from OVERLAPPED so the worker can cast the packet's lpOverlapped back to it
12 {
13     int m_nMin;   // first integer of the range to sum (inclusive)
14     int m_nMax;   // last integer of the range to sum (inclusive)
15 };
16
17 // called by worker thread on Iocp
18 DWORD WINAPI WorkItemProccessor(LPVOID param)
19 {
20     // param is the OVERLAPPED_ENTRY handed over by the worker thread; its lpOverlapped is our WORK_ITEM
21     WORK_ITEM* item = (WORK_ITEM*)((OVERLAPPED_ENTRY *)param)->lpOverlapped;
22
23     // sum every integer in [m_nMin, m_nMax]
24     __int64 partialSum = 0;
25     int cur = item->m_nMin;
26     while (cur <= item->m_nMax)
27     {
28         partialSum += cur;
29         ++cur;
30     }
31
32     // fold the partial result into the global total, then mark this item as finished
33     InterlockedExchangeAdd64(&g_sum, partialSum);
34     InterlockedExchangeAdd(&g_curWorkCount, -1);
35     return 0;
36 }
37
38 int main(int argc, char** argv)
39 {
40     //g_showDetailInfo = TRUE;
41
42     // startup iocp server
43     StartupIocpServer(WorkItemProccessor);
44
45     // create & post some work items
46     g_curWorkCount = WORK_COUNT;
47     WORK_ITEM arraWI[WORK_COUNT];
48     ZeroMemory(arraWI, sizeof(WORK_ITEM) * WORK_COUNT);
49
50     for (int i = 0; i < WORK_COUNT; ++i)
51     {
52         arraWI[i].m_nMax = (i + 1) * 10000 - 1;
53         arraWI[i].m_nMin = i * 10000;
54         QueueWorkToIocpServer(0, 0, arraWI + i);
55     }
56
57     // poll the work progress
58     bool isWorkDone = false;
59     while (!isWorkDone)
60     {
61         Sleep(1000);
62         isWorkDone = (g_curWorkCount == 0);
63     }
64
65     // check final results
66     printf("the final results are [%I64d]\n", g_sum);
67
68     StopIocpServer();
69
70     // check thread load statistics
71     for (std::map<int, int>::const_iterator iter = g_mapThreadLoad.begin(); iter != g_mapThreadLoad.end(); ++iter)

72     {
73         printf("thread [%d] processed %d work items\n", iter->first, iter->second);
74     }
75 }


IocpServer.cpp - iocp server implementation
 1 DWORD WINAPI WorkerThreadProc(LPVOID param)
 2 {
 3     LPTHREAD_START_ROUTINE taskProcessor = (LPTHREAD_START_ROUTINE)param;
 4
 5     OVERLAPPED_ENTRY olEntry = {0};
 6     while (true)
 7     {
 8         // get completion packet on Iocp
 9         if (GetQueuedCompletionStatus(
10                 g_hIocp,
11                 &(olEntry.dwNumberOfBytesTransferred),
12                 &(olEntry.lpCompletionKey),
13                 &(olEntry.lpOverlapped),
14                 INFINITE))
15         {
16             if (olEntry.lpCompletionKey == ckStopCommand)
17             {
18                 g_showDetailInfo ? printf("worker thread [%d] ends gracefully.\n", GetCurrentThreadId()) : void();
19                 break;
20             }
21             else
22             {
23                 g_showDetailInfo ? printf("worker thread [%d] will process one request.\n", GetCurrentThreadId()) : void();
24                 g_mapThreadLoad[GetCurrentThreadId()]++;
25                 taskProcessor((LPVOID)&olEntry);
26             }
27         }
28         else
29         {
30             // report error and exit this thread
31             g_showDetailInfo ? printf("Error when query completion status on Iocp due to: [%d]\n", GetLastError()) : void();
32             break;
33         }
34     }
35
36     return 0;
37 }
38
39 BOOL StartupIocpServer(LPTHREAD_START_ROUTINE taskProcessor)
40 {
41     // Get # of system Processor
42     SYSTEM_INFO sysInfo;
43     GetSystemInfo(&sysInfo);
44     DWORD coreCount = sysInfo.dwNumberOfProcessors;
45
46     // Create Iocp
47     g_hIocp = CreateNewCompletionPort(coreCount);
48     if (g_hIocp == NULL)
49     {
50         printf("Failed to create Iocp due to: [%d]\n", GetLastError());
51         return FALSE;
52     }
53
54     // Startup worker threads
55     DWORD dwThreadId = 0;
56     dwThreadCount = coreCount * 2;
57     g_arrThreads = (HANDLE*) new unsigned char[sizeof(HANDLE) * dwThreadCount];
58     for (DWORD i = 0; i < dwThreadCount; ++i)
59     {
60         HANDLE hThread = CreateThread(NULL, 0, WorkerThreadProc, (LPVOID)taskProcessor, 0, &dwThreadId);
61         if (hThread == NULL)
62         {
63             printf("Failed to create worker thread due to: [%d]\n", GetLastError());
64             
65             // should clean created threads here
66             delete[] g_arrThreads;
67             g_arrThreads = NULL;
68             return FALSE;
69         }
70         else
71         {
72             g_arrThreads[i] = hThread;
73             printf("Thread [%d] created successfully\n", dwThreadId);
74         }
75     }
76
77     return TRUE;
78 }
full source code package: http://code4cs.googlecode.com/files/Iocp4W.zip

Notes:
1. the win32 API CreateIoCompletionPort() is overloaded in its semantics; I have divided it into two APIs in my code (see IocpComm.h): CreateNewIoCompletionPort() and AssociateIoCompletionPortWithDevice()
2. the OVERLAPPED (or derived struct) objects (memory addresses) created at line47@main.cpp are retrieved/accessed by the worker threads at line13@IocpServer.cpp, so you must ensure that these data structures are not destroyed before the worker threads complete.
3. the g_showDetailInfo variable at line40@main.cpp controls the output of detail information. If it is turned on, it introduces i/o operations in the worker threads, which greatly impacts worker thread scheduling. You can try turning it on/off to see the different results and explain what happens behind the scenes.
4. essentially, Iocp is a system Queue with some thread scheduling improvements.

[Reference]
1. iocp on Windows
2. Inside windows iocp
3. Async I/O & iocp
4. iocp on Solaris
5. Thread Pooling