CNK's blog

Postgres Makes Scheduling Easy

At work, we need to build a scheduling system. We want to present the user with a list of possible dates - and then the possible slots on each date. I don’t want to store all the possible empty slots in the database, so I thought I would have to build them procedurally using Python.

import calendar
from datetime import timedelta
from pytz import timezone as pytz_timezone

AVAILABLE_DAYS = ['Monday', 'Wednesday', 'Friday']
AVAILABLE_START_TIME = {'hours': 8, 'minutes': 0, 'timezone': 'UTC'}
# start + estimated duration must be earlier than this
AVAILABLE_END_TIME = {'hours': 20, 'minutes': 0, 'timezone': 'UTC'}

def possible_times(start_date, end_date, estimated_duration, granularity=60):
    '''
    Returns a list of datetimes when a user may start a reservation between start_date and end_date (inclusive).
    estimated_duration must be a datetime.timedelta.
    By default reservations may start hourly from AVAILABLE_START_TIME onwards;
    you may adjust how frequently reservations may start by setting the 'granularity' (in minutes)
    '''
    possibles = []
    date = _first_slot(start_date)
    while date <= end_date:
        if not _is_possible_day(date):
            # skip this day
            date += timedelta(days=1)
            continue

        # find slots on day
        last_slot = _last_slot(date, estimated_duration)
        while date <= last_slot:
            possibles.append(date)
            date += timedelta(minutes=granularity)

        # go to next day
        date = _first_slot(date + timedelta(days=1))

    return possibles


# ############## helper methods #######################

def _is_possible_day(date, available_days=None):
    if not available_days:
        available_days = AVAILABLE_DAYS
    return calendar.day_name[date.weekday()] in available_days


def _first_slot(date, start_time=None):
    '''Returns the first slot of the day'''
    if not start_time:
        start_time = AVAILABLE_START_TIME
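    # NB: datetime.replace(tzinfo=...) is only safe with fixed-offset zones
    # like UTC; for other pytz zones you would want tz.localize() instead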
    first_slot = date.replace(hour=start_time['hours'],
                              minute=start_time['minutes'],
                              tzinfo=pytz_timezone(start_time['timezone']))
    return first_slot


def _last_slot(date, duration, end_time=None):
    if not end_time:
        end_time = AVAILABLE_END_TIME
    last_slot = date.replace(hour=end_time['hours'],
                             minute=end_time['minutes'],
                             tzinfo=pytz_timezone(end_time['timezone']))
    last_slot -= duration
    return last_slot

The code above loops over the days in the range - and, on available days, loops over the hours in that day - and returns a list of datetimes. There is a lot of ugly adding of Python timedelta objects and resetting the time to start iterating on a new day. It works - but the next step, eliminating slots that are already full, is going to be even uglier: lots of tedious “does this interval overlap with an existing scheduled event” checks (sketched after the tests below). Here are a couple of tests for the Python version:

from datetime import datetime, timezone, timedelta
from django.test import TestCase
from ..utils import possible_times

class ReservationUtilsTests(TestCase):
    # ############### integration tests ##########################
    def test_no_possible_times_on_sunday(self):
        start_date = datetime(2017, 4, 30, hour=8, minute=0, tzinfo=timezone.utc)
        end_date = datetime(2017, 4, 30, hour=23, minute=0, tzinfo=timezone.utc)
        duration = timedelta(minutes=60)
        slots = possible_times(start_date, end_date, duration)
        self.assertEqual(slots, [])

    def test_hourly_possible_times_end_earlier_if_takes_longer(self):
        start_date = datetime(2017, 4, 30, hour=8, minute=0, tzinfo=timezone.utc)
        end_date = datetime(2017, 5, 2, hour=23, minute=0, tzinfo=timezone.utc)
        expected = [datetime(2017, 5, 1, 8, 0, tzinfo=timezone.utc),
                    datetime(2017, 5, 1, 9, 0, tzinfo=timezone.utc),
                    datetime(2017, 5, 1, 10, 0, tzinfo=timezone.utc),
                    datetime(2017, 5, 1, 11, 0, tzinfo=timezone.utc),
                    datetime(2017, 5, 1, 12, 0, tzinfo=timezone.utc),
                    datetime(2017, 5, 1, 13, 0, tzinfo=timezone.utc),
                    datetime(2017, 5, 1, 14, 0, tzinfo=timezone.utc),
                    datetime(2017, 5, 1, 15, 0, tzinfo=timezone.utc),
                    datetime(2017, 5, 1, 16, 0, tzinfo=timezone.utc),
                    datetime(2017, 5, 1, 17, 0, tzinfo=timezone.utc),
                    datetime(2017, 5, 1, 18, 0, tzinfo=timezone.utc),
                    datetime(2017, 5, 1, 19, 0, tzinfo=timezone.utc),
                    ]
        slots = possible_times(start_date, end_date, estimated_duration=timedelta(minutes=60))
        self.assertEqual(len(slots), 12)
        self.assertEqual(slots, expected)
        slots2 = possible_times(start_date, end_date, estimated_duration=timedelta(minutes=120))
        self.assertEqual(len(slots2), 11)
        self.assertEqual(slots2, expected[0:11])
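
Before it goes away entirely, here is roughly what the pure-Python conflict check would have looked like (a minimal sketch; the reservation objects and their start/end attributes are hypothetical):

def _overlaps(start_a, end_a, start_b, end_b):
    # two half-open intervals [start, end) overlap iff each one
    # starts before the other one ends
    return start_a < end_b and start_b < end_a

def _is_free(slot_start, duration, reservations):
    '''True if [slot_start, slot_start + duration) hits no existing reservation'''
    slot_end = slot_start + duration
    return not any(_overlaps(slot_start, slot_end, r.start, r.end)
                   for r in reservations)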

When I started looking into how to check the overlap, I looked at checking overlaps in the database - and found that a) Postgres has a timestamp range data type (tstzrange), b) Django’s Postgres extensions have a field that wraps the Postgres tstzrange type (DateTimeRangeField), and c) the Postgres docs even have an example of how to create an index that prevents you from scheduling more than one person to occupy a specific room at one time. All that ugly Python turns into:

import calendar
from django.db import connection

AVAILABLE_DAYS = ['Monday', 'Wednesday', 'Friday']
AVAILABLE_START_TIME = '08:00'
# start + estimated duration must be earlier than this
AVAILABLE_END_TIME = '20:00'


def possible_times(start_date, end_date, estimated_duration, granularity=60):
    allowed_days = [list(calendar.day_name).index(day) + 1 for day in AVAILABLE_DAYS]

    cursor = connection.cursor()
    sql = '''
          SELECT *
          FROM   generate_series (timestamp %(start_date)s
                                  , timestamp %(end_date)s - interval '%(duration)s minutes'
                                  , interval '%(granularity)sm') h
          WHERE  EXTRACT(ISODOW FROM h) in %(allowed_days)s
            AND  h::time >= %(start_time)s
            AND  h::time <= %(end_time)s - interval '%(duration)s minutes'
          ;
          '''
    cursor.execute(sql, {'start_date': start_date,
                         'start_time': AVAILABLE_START_TIME,
                         'end_date': end_date,
                         'end_time': AVAILABLE_END_TIME,
                         'duration': estimated_duration,
                         'granularity': granularity,
                         'allowed_days': tuple(allowed_days),
                         })

    slots = [row[0] for row in cursor.fetchall()]
    return slots

The only slightly tricky part was restricting the allowed days to MWF. I want my constant to use the day names, not the integers Postgres uses for days of the week, so I needed Python’s calendar module to convert “Monday” to an integer. Python numbers Monday 0, but Postgres’s ISODOW numbers it 1, so add 1. Then it took me a little while to figure out how to pass a list into the query in a way that everything is properly interpolated and quoted; the trick: tuple(allowed_days).
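
You can see what psycopg2 will do with each Python type by adapting values directly (a quick illustration; exact output formatting may vary between psycopg2 versions):

from psycopg2.extensions import adapt

print(adapt(tuple([1, 3, 5])).getquoted())  # (1, 3, 5) - exactly what IN wants
print(adapt([1, 3, 5]).getquoted())         # ARRAY[1,3,5] - a Postgres array, not an IN list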

Now I just need to join to my reservations table to exclude slots where the schedule is already full.
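
Something like the following should do it (a sketch I have not tested; the reservations table and its tstzrange column time_span are stand-ins for whatever your DateTimeRangeField migration actually creates):

sql = '''
      SELECT *
      FROM   generate_series (timestamp %(start_date)s
                              , timestamp %(end_date)s - interval '%(duration)s minutes'
                              , interval '%(granularity)sm') h
      WHERE  EXTRACT(ISODOW FROM h) in %(allowed_days)s
        AND  h::time >= %(start_time)s
        AND  h::time <= %(end_time)s - interval '%(duration)s minutes'
        -- skip slots whose interval overlaps an existing reservation
        AND  NOT EXISTS (
               SELECT 1
               FROM   reservations r
               WHERE  r.time_span && tstzrange(h, h + interval '%(duration)s minutes')
             )
      ;
      '''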

Forcing Django to Make a Subquery

Django has extensive documentation for its ORM, but somehow I still end up surprised by some of the queries it builds. The default logging configuration doesn’t log queries the way Rails does (in its development environment), so when a query appears to give the correct results, I don’t usually check the SQL. But I recently had a page start to fail. I finally tracked it down to a specific query, and when I couldn’t immediately see why I wasn’t getting the row I expected, I printed the query - Django was not building the query I thought it had been. The Python is:

Material.objects.filter(goal_id=goal_key,
                        material_state='deployed',
                        category_id=category) \
                .exclude(individualizations__user__id=user_id,
                         individualizations__material_state__in=('done',)) \
                .select_related('category') \
                .order_by('category__position', 'created_at')

This produces the following SQL (edited slightly to make it easier to read):

SELECT "appname_materials".*, "appname_categories"."category_key", "appname_categories"."position"
FROM "appname_materials" INNER JOIN "appname_categories"
  ON ( "appname_materials"."category_key" = "appname_categories"."category_key" )
WHERE ("appname_materials"."material_state" = 'deployed'
       AND "appname_materials"."goal_key" = 'goal1'
       AND "appname_materials"."category_key" = 'transition'
       AND NOT ("appname_materials"."material_key" IN (SELECT U1."material_key" AS Col1
                                                       FROM "appname_material_individualizations" U1
                                                       WHERE U1."material_state" IN ('done'))
                AND "appname_materials"."material_key" IN (SELECT U1."material_key" AS Col1
                                                           FROM "appname_material_individualizations" U1
                                                           WHERE U1."user_id" = 1))
      )
ORDER BY "appname_categories"."position" ASC, "appname_materials"."created_at" ASC

Hmmm, that’s not what I want. I don’t want 2 subqueries, one for each condition; I want one subquery with two conditions. If I had wanted 2 subqueries, I would have written 2 excludes, like this:

Material.objects.filter(goal_id=goal_key,
                        material_state='deployed',
                        category_id=category) \
                .exclude(individualizations__user__id=user_id) \
                .exclude(individualizations__material_state__in=('done',)) \
                .select_related('category') \
                .order_by('category__position', 'created_at')

But both of those QuerySet definitions produce the same SQL. So how can I produce the following SQL using the Django ORM:

SELECT "appname_materials".*, "appname_categories"."category_key", "appname_categories"."position"
FROM "appname_materials" INNER JOIN "appname_categories"
  ON ( "appname_materials"."category_key" = "appname_categories"."category_key" )
WHERE ("appname_materials"."material_state" = 'deployed'
       AND "appname_materials"."goal_key" = 'goal1'
       AND "appname_materials"."category_key" = 'transition'
       AND NOT "appname_materials"."material_key" IN (SELECT U1."material_key" AS Col1
                                                      FROM "appname_material_individualizations" U1
                                                      WHERE U1."material_state" IN ('done')
                                                      AND U1."user_id" = 1)
       )
ORDER BY "appname_categories"."position" ASC, "appname_materials"."created_at" ASC

I tried a couple of things using Q but mostly ended up with syntax errors. Fortunately I finally found a Stack Overflow thread which references the bug report for this problem AND the solution. You can force Django to build the desired subquery by writing the subquery explicitly:

Material.objects.filter(goal_id=goal_key,
                        material_state='deployed',
                        category_id=category) \
                .exclude(
                    material_key__in=(
                        MaterialIndividualization.objects
                            .values_list('material_id', flat=True)
                            .filter(user__id=user_id, material_state__in=('done',))
                    )
                ) \
                .select_related('category') \
                .order_by('category__position', 'created_at')

It’s a little verbose, but it is actually a little clearer in some respects - it reads like a direct Python translation of the desired SQL. And because QuerySets are lazy, the inner QuerySet is never executed on its own; it is rendered as the subquery inside the outer query.

Tuning Django REST Framework Serializers

One problem that often comes up when you are using an object-relational mapper is the N+1 query problem: inadvertently doing one query for a set of rows and then a separate query for the related objects of each row. When building sites with Ruby on Rails, the framework logs all SQL queries while you are in development mode, so you tend to fix these inefficient queries as you develop - if nothing else, in self-defense so you can actually see the things you care about in your logs.
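
In Django ORM terms, the pattern looks something like this (an illustrative sketch using made-up Goal/metagoal models like the ones that show up in the logs later in this post):

# the N+1 shape: one query for the goals, plus one more per goal
for goal in Goal.objects.all():
    print(goal.metagoal.name)

# select_related pulls the related rows in with a join - one query total
for goal in Goal.objects.select_related('metagoal'):
    print(goal.metagoal.name)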

Django, on the other hand, logs nothing except the timestamp, request, response code, and response size. Its default logging configuration doesn’t log any request parameters or database queries, so it’s easy to overlook inefficient queries. When we finally put a reasonable amount of test data on our staging server, we found that several of our API endpoints were agonizingly slow. So, time for some tuning!

Setup

Lots of people use the Django Debug Toolbar, but I really prefer log files, so I installed and configured Django Query Inspector. That was helpful for identifying some of the worst offenders, but for the real tuning I needed this stanza to log all database queries:

LOGGING = {
    'version': 1,
    'disable_existing_loggers': False,
    'handlers': {
        'console': {
            'level': 'DEBUG',
            'class': 'logging.StreamHandler',
        }
    },
    'loggers': {
        'django.db.backends': {
            'handlers': ['console'],
            'level': 'DEBUG',
        },
    }
}

Once I had that going, I started looking at some of my nested serializers. With a couple of well-placed select_related calls on the queries in my views, I was able to get rid of most of the excess queries - but I was consistently seeing one extra query that I couldn’t figure out, until I started to write up an issue to post on IRC.
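
The select_related fixes were along these lines (a simplified sketch; the GoalViewSet and GoalSerializer names are assumptions based on the endpoints in the logs below):

from rest_framework import viewsets

class GoalViewSet(viewsets.ModelViewSet):
    # join metagoal in the initial query so serializing the nested
    # relationship does not cost one extra query per goal
    queryset = Goal.objects.select_related('metagoal')
    serializer_class = GoalSerializer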

The extra query was coming in because I was using DRF’s browsable API to do my query tuning. The browsable API includes a web form for experimenting with the create and update actions in a ModelViewSet, and that form has a select menu for each foreign key relationship - and populating those menus takes an extra query per related model. So when I made a request in the browser, I saw:

(0.000) QUERY = '
SELECT "project_goal"."id", "project_goal"."name",
       "project_goal"."metagoal_id", "project_metagoal"."id",
       "project_metagoal"."name", "project_metagoal"."project_id"
FROM "project_goal" INNER JOIN "project_metagoal"
  ON ("project_goal"."metagoal_id" = "project_metagoal"."id" )
WHERE "project_goal"."id" = %s' - PARAMS = (3,); args=(3,)

(0.000) QUERY = '
SELECT "project_metagoal"."id",
       "project_metagoal"."name", "project_metagoal"."project_id"
FROM "project_metagoal"' - PARAMS = (); args=()

[SQL] 2 queries (0 duplicates), 0 ms SQL time, 101 ms total request time
[15/Jul/2016 01:40:53] "GET /api/goals/3/ HTTP/1.1" 200 10565

But when I made the same request using curl, I saw only the one join query I was expecting:

$ curl http://127.0.0.1:8000/api/goals/3/ | jq .
{"id": 3,
 "url": "http://127.0.0.1:8000/api/goals/3/",
 "name": "Subgoal 3",
 "metagoal": "http://127.0.0.1:8000/api/metagoals/1/"
}

(0.000) QUERY = '
SELECT "project_goal"."id", "project_goal"."name",
       "project_goal"."metagoal_id", "project_metagoal"."id",
       "project_metagoal"."name", "project_metagoal"."project_id"
FROM "project_goal" INNER JOIN "project_metagoal"
  ON ("project_goal"."metagoal_id" = "project_metagoal"."id" )
WHERE "project_goal"."id" = %s' - PARAMS = (3,); args=(3,)

[SQL] 1 queries (0 duplicates), 0 ms SQL time, 12 ms total request time
[15/Jul/2016 01:40:47] "GET /api/goals/3/ HTTP/1.1" 200 5398

Bash_it Using Git Diff as Diff

I used Kitchenplan to set up my new Mac. There is a newer configuration option based on Ansible by the same author - Superlumic. I would like to try it but didn’t have time to experiment this time around.

The big plus for using Kitchenplan was that our small development team ended up with Macs that are all configured more or less the same way. Another plus is that it installs bash_it, which does a lot more shell configuring than I have ever bothered to do. The only thing I have found not to like is that it wants to invoke git’s diff tool instead of the regular Unix diff. To shut that off, I just edited the place where that was set up: in /etc/bash_it/custom/functions.bash (line 72), I commented out:

# Use Git's colored diff when available
hash git &>/dev/null
if [ $? -eq 0 ]; then
  function diff() {
    git diff --no-index --color-words "$@"
  }
fi

Testing File Uploads (in Django)

I am trying to improve the test coverage of our work project and needed to test the avatar upload associated with creating users. I didn’t find any place that laid out how to test file uploads, but fortunately the tests for the easy-thumbnails app we use are pretty good, and I was able to piece something together using their code as a model.

In case anyone else is looking for something like this, I updated my easy thumbnails example project to include a couple of tests.

from PIL import Image
from django.core.files.base import ContentFile
from django.core.files.uploadedfile import SimpleUploadedFile
from django.test import TestCase, Client
from django.core.urlresolvers import reverse
from django.utils.six import BytesIO
from .factories import UserFactory
from .models import UserProfile


# "borrowed" from easy_thumbnails/tests/test_processors.py
def create_image(storage, filename, size=(100, 100), image_mode='RGB', image_format='PNG'):
    """
    Generate a test image, returning the filename that it was saved as.

    If ``storage`` is ``None``, the BytesIO containing the image data
    will be passed instead.
    """
    data = BytesIO()
    Image.new(image_mode, size).save(data, image_format)
    data.seek(0)
    if not storage:
        return data
    image_file = ContentFile(data.read())
    return storage.save(filename, image_file)


class UserTests(TestCase):
    def setUp(self):
        self.user = UserFactory(username='me')

    # deleting the user will remove the user, the user_profile, AND the avatar image
    def tearDown(self):
        self.user.delete()

    def test_adding_an_avatar_image(self):
        # make sure we start out with no UserProfile (and thus no avatar)
        self.assertIsNone(UserProfile.objects.filter(user_id=self.user.id).first())
        myClient = Client()
        myClient.login(username=self.user.username, password='password')

        # set up form data
        avatar = create_image(None, 'avatar.png')
        avatar_file = SimpleUploadedFile('front.png', avatar.getvalue())
        form_data = {'avatar': avatar_file}

        response = myClient.post(reverse('avatar_form'), form_data, follow=True)
        self.assertRegex(response.redirect_chain[0][0], r'/users/profile/$')
        # And now there is a user profile with an avatar
        self.assertIsNotNone(self.user.profile.avatar)

    def test_uploading_non_image_file_errors(self):
        # make sure we start out with no UserProfile (and thus no avatar)
        self.assertIsNone(UserProfile.objects.filter(user_id=self.user.id).first())
        myClient = Client()
        myClient.login(username=self.user.username, password='password')

        # set up form data
        text_file = SimpleUploadedFile('front.png', b'this is some text - not an image')
        form_data = {'avatar': text_file}

        response = myClient.post(reverse('avatar_form'), form_data, follow=True)
        self.assertFormError(response, 'avatar_form', 'avatar',
                             'Upload a valid image. The file you uploaded was either not an image or a corrupted image.')