Can your system withstand the process chaos monkey?

As part of my efforts to constantly improve and harden Rebus, I have recently put it through some testing that should reveal any weaknesses around how it handles transactions in the face of failure.

My most recent test involved a client that would send a number of sequentially numbered messages to a FragileProcessor, which would process these messages and save the contained number in a SQL Server database – and while doing this, FragileProcessor would constantly crash and burn!

To help me do this, I devised a simple application called ProcessChaosMonkey, containing this evil beast (whose name is inspired by – a.k.a. shamelessly ripped off from – Netflix’s Chaos Monkey):

/// <summary>
/// Repeatedly picks a random name from a configured list of process names and
/// kills every running process with that name, to simulate random crashes.
/// </summary>
/// <remarks>
/// The work happens on a dedicated thread that is created in the constructor,
/// started by <see cref="Start"/> and stopped (and joined) by <see cref="Dispose"/>.
/// </remarks>
public class TheMonkey : IDisposable
{
    readonly string[] processNamesToKill;
    readonly Random random = new Random();
    readonly Thread monkeyThread;

    // Set to false by Dispose and polled by the worker thread to know when to stop.
    volatile bool keepWorking = true;

    /// <summary>
    /// Creates the monkey; no process is touched until <see cref="Start"/> is called.
    /// </summary>
    /// <param name="processNamesToKill">Names of the processes that may be killed.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="processNamesToKill"/> is null.
    /// </exception>
    public TheMonkey(string[] processNamesToKill)
    {
        // Fail fast: a null array would otherwise only blow up later, in Start.
        if (processNamesToKill == null)
            throw new ArgumentNullException("processNamesToKill");

        this.processNamesToKill = processNamesToKill;
        monkeyThread = new Thread(DoMonkeyWork);
    }

    /// <summary>
    /// Worker loop: waits a random 4 to 7 seconds, then kills a random target,
    /// until <see cref="Dispose"/> asks it to stop.
    /// </summary>
    void DoMonkeyWork()
    {
        while (true)
        {
            var secondsToWait = random.NextDouble() * 3 + 4; //< secret biz consts
            var startTime = DateTime.UtcNow;

            // Sleep in short slices so a stop request is noticed within ~100 ms.
            while (keepWorking && !EnoughTimeElapsed(startTime, secondsToWait))
                Thread.Sleep(100);

            if (!keepWorking) break;

            Fail();
        }
    }

    /// <summary>
    /// Returns true once at least <paramref name="secondsToWait"/> seconds have
    /// passed since <paramref name="startTime"/> (compared against UTC now).
    /// </summary>
    static bool EnoughTimeElapsed(DateTime startTime, double secondsToWait)
    {
        return (DateTime.UtcNow - startTime).TotalSeconds >= secondsToWait;
    }

    /// <summary>
    /// Picks one of the configured process names at random and kills every
    /// running instance of it. Errors are logged to the console, never thrown.
    /// </summary>
    void Fail()
    {
        // Nothing to pick from; indexing an empty array would throw on the
        // worker thread and take the whole application down.
        if (processNamesToKill.Length == 0) return;

        var randomIndex = random.Next(processNamesToKill.Length);
        var processToKill = processNamesToKill[randomIndex];

        Process[] processes;
        try
        {
            processes = Process.GetProcessesByName(processToKill);
        }
        catch (Exception e)
        {
            Console.WriteLine("An error occurred while attempting to kill '{0}': {1}",
                                processToKill, e);
            return;
        }

        Console.WriteLine("Killing '{0}'!! ({1} instances running)",
                            processToKill, processes.Length);

        foreach (var process in processes)
        {
            try
            {
                process.Kill();
            }
            catch (Exception e)
            {
                // Handled per instance (e.g. the process exited on its own in the
                // meantime) so one failure does not spare the remaining instances.
                Console.WriteLine("An error occurred while attempting to kill '{0}': {1}",
                                    processToKill, e);
            }
            finally
            {
                // Process wraps an OS handle; release it on every round.
                process.Dispose();
            }
        }
    }

    /// <summary>
    /// Logs the configured targets and starts the worker thread.
    /// </summary>
    public void Start()
    {
        var processNames = string.Join(", ", processNamesToKill);
        Console.WriteLine(@"Unleashing the monkey on: {0}", processNames);
        monkeyThread.Start();
    }

    /// <summary>
    /// Asks the worker thread to stop and waits for it to finish. Safe to call
    /// even if <see cref="Start"/> was never called.
    /// </summary>
    public void Dispose()
    {
        Console.WriteLine("Stopping the monkey");
        keepWorking = false;

        // Thread.Join throws ThreadStateException on a thread that was never started.
        // ThreadState is fully qualified because System.Diagnostics declares one too.
        var neverStarted =
            (monkeyThread.ThreadState & System.Threading.ThreadState.Unstarted) != 0;
        if (neverStarted) return;

        monkeyThread.Join();
    }
}

/// <summary>
/// Repeatedly picks a random name from a configured list of process names and
/// kills every running process with that name, to simulate random crashes.
/// </summary>
/// <remarks>
/// The work happens on a dedicated thread that is created in the constructor,
/// started by <see cref="Start"/> and stopped (and joined) by <see cref="Dispose"/>.
/// </remarks>
public class TheMonkey : IDisposable
{
    readonly string[] processNamesToKill;
    readonly Random random = new Random();
    readonly Thread monkeyThread;

    // Set to false by Dispose and polled by the worker thread to know when to stop.
    volatile bool keepWorking = true;

    /// <summary>
    /// Creates the monkey; no process is touched until <see cref="Start"/> is called.
    /// </summary>
    /// <param name="processNamesToKill">Names of the processes that may be killed.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="processNamesToKill"/> is null.
    /// </exception>
    public TheMonkey(string[] processNamesToKill)
    {
        // Fail fast: a null array would otherwise only blow up later, in Start.
        if (processNamesToKill == null)
            throw new ArgumentNullException("processNamesToKill");

        this.processNamesToKill = processNamesToKill;
        monkeyThread = new Thread(DoMonkeyWork);
    }

    /// <summary>
    /// Worker loop: waits a random 4 to 7 seconds, then kills a random target,
    /// until <see cref="Dispose"/> asks it to stop.
    /// </summary>
    void DoMonkeyWork()
    {
        while (true)
        {
            var secondsToWait = random.NextDouble() * 3 + 4; //< secret biz consts
            var startTime = DateTime.UtcNow;

            // Sleep in short slices so a stop request is noticed within ~100 ms.
            while (keepWorking && !EnoughTimeElapsed(startTime, secondsToWait))
                Thread.Sleep(100);

            if (!keepWorking) break;

            Fail();
        }
    }

    /// <summary>
    /// Returns true once at least <paramref name="secondsToWait"/> seconds have
    /// passed since <paramref name="startTime"/> (compared against UTC now).
    /// </summary>
    static bool EnoughTimeElapsed(DateTime startTime, double secondsToWait)
    {
        return (DateTime.UtcNow - startTime).TotalSeconds >= secondsToWait;
    }

    /// <summary>
    /// Picks one of the configured process names at random and kills every
    /// running instance of it. Errors are logged to the console, never thrown.
    /// </summary>
    void Fail()
    {
        // Nothing to pick from; indexing an empty array would throw on the
        // worker thread and take the whole application down.
        if (processNamesToKill.Length == 0) return;

        var randomIndex = random.Next(processNamesToKill.Length);
        var processToKill = processNamesToKill[randomIndex];

        Process[] processes;
        try
        {
            processes = Process.GetProcessesByName(processToKill);
        }
        catch (Exception e)
        {
            Console.WriteLine("An error occurred while attempting to kill '{0}': {1}",
                                processToKill, e);
            return;
        }

        Console.WriteLine("Killing '{0}'!! ({1} instances running)",
                            processToKill, processes.Length);

        foreach (var process in processes)
        {
            try
            {
                process.Kill();
            }
            catch (Exception e)
            {
                // Handled per instance (e.g. the process exited on its own in the
                // meantime) so one failure does not spare the remaining instances.
                Console.WriteLine("An error occurred while attempting to kill '{0}': {1}",
                                    processToKill, e);
            }
            finally
            {
                // Process wraps an OS handle; release it on every round.
                process.Dispose();
            }
        }
    }

    /// <summary>
    /// Logs the configured targets and starts the worker thread.
    /// </summary>
    public void Start()
    {
        var processNames = string.Join(", ", processNamesToKill);
        Console.WriteLine(@"Unleashing the monkey on: {0}", processNames);
        monkeyThread.Start();
    }

    /// <summary>
    /// Asks the worker thread to stop and waits for it to finish. Safe to call
    /// even if <see cref="Start"/> was never called.
    /// </summary>
    public void Dispose()
    {
        Console.WriteLine("Stopping the monkey");
        keepWorking = false;

        // Thread.Join throws ThreadStateException on a thread that was never started.
        // ThreadState is fully qualified because System.Diagnostics declares one too.
        var neverStarted =
            (monkeyThread.ThreadState & System.Threading.ThreadState.Unstarted) != 0;
        if (neverStarted) return;

        monkeyThread.Join();
    }
}

which I would then run like this: ProcessChaosMonkey.exe FragileProcessor, causing the FragileProcessor process to be constantly shot down at random times.

As FragileProcessor is a Windows Service, I configured it to recover immediately upon crashing. This way, I could leave “my system” running, processing thousands and thousands of messages, and then I could come back and see if messages had been dropped. This way of planning for failure is, in my opinion, the only way to go if you want to build systems that can’t fail. Do you think your system can withstand ProcessChaosMonkey?

Oh, and if you’re interested in how the most recent testing went, stay tuned for the next post where I’ll talk a little bit about delivery guarantees…

Can your system withstand the process chaos monkey?

Like this:

Related

Leave a Reply Cancel reply

Share this:

Like this:

Related

Leave a Reply Cancel reply