Doctests or nosetests?

I had a torrid love affair with doctests. It was AMAZING! You mean you can write documentation AND test your code at the same time!? You can write a line of code, put down the answer next to it and the computer WILL CHECK IT FOR YOU?

Well, this went on for several months. Then one morning I woke up next to this:

def handle_variant(variant, f_vseq, f_vseq_pos, strand_no=None):
  """Write the variant to the mutated sequence and fill out the pos array

  Some setup
  >>> import io, vcf, struct
  >>> def add_GT(v, gt):
  ...   md=vcf.model.make_calldata_tuple('GT')(gt)
  ...   call=vcf.model._Call(v,'sample',md)
  ...   v.samples = [call]
  ...   v._sample_indexes = {'sample': 0}

  Test with SNP: ignore zygosity
  >>> ref_seq = 'ACTGACTG'; \
  pos = 2; \
  ref = 'C'; \
  alt = [vcf.model._Substitution('T')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  add_GT(variant, '1/1'); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  T
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)

  Test with SNP: homo, strand 0
  >>> f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 0)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  T
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)


  Test with SNP: homo, strand 1
  >>> f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 1)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  T
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)



  Test with SNP: het, strand 0
  >>> add_GT(variant, '0/1'); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 0)
  1
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  
  >>> _ = f_vseq_pos.seek(0,2); print f_vseq.tell()
  0

  Test with SNP: het, strand 1
  >>> f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 1)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  T
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)


  Test with delete: ignore zygosity
  >>> ref_seq = 'ACTGACTG'; \
  pos = 2; \
  ref = 'CTG'; \
  alt = [vcf.model._Substitution('C')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  add_GT(variant, '1/0'); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  4
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  C
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)

  Test with same delete, strand 1 (same as REF)
  >>> f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 1)
  1
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  
  >>> _ = f_vseq_pos.seek(0,2); print f_vseq.tell()
  0


  Test with delete and .
  >>> ref_seq = 'ACTGACTG'; \
  pos = 8; \
  ref = 'G'; \
  alt = [vcf.model._Substitution('.')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  8
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  
  >>> _ = f_vseq_pos.seek(0,2); print f_vseq.tell()
  0


  Test with delete and .
  >>> ref_seq = 'ACTGACTG'; \
  pos = 4; \
  ref = 'GACTG'; \
  alt = [vcf.model._Substitution('.')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  8
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  
  >>> _ = f_vseq_pos.seek(0,2); print f_vseq.tell()
  0


  Test with insert
  >>> ref_seq = 'ACTGACTG'; \
  pos = 2; \
  ref = 'C'; \
  alt = [vcf.model._Substitution('CGGG')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  CGGG
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('4I',f_vseq_pos.read(4*4))
  (2, 3, 3, 3)


  Test with insert
  >>> ref_seq = 'ACTGACTG'; \
  pos = 8; \
  ref = 'G'; \
  alt = [vcf.model._Substitution('GTTT')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  8
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  GTTT
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('4I',f_vseq_pos.read(4*4))
  (8, 9, 9, 9)

  See Readme.md for details of algorithm
  """
  var_type = '1' if strand_no is None else variant.genotype('sample').data.GT.split('/')[strand_no]
  # 0 means REF 1 means ALT. If we don't specify a strand number it means we don't care about Zygosity and will always
  # take the ALT
  if var_type == '1':
    alt = variant.ALT[0].sequence
    if alt == '.': alt = ''
    ref = variant.REF
    if ref == '.': ref = ''
    f_vseq.write(alt)    # Copy over ALT
    new_ref_pos = len(ref) + variant.POS - 1  # Advance along ref_seq
                                              # -1 because POS is 1-indexed, we are 0-indexed internally
    if len(alt) > 0:
      if len(ref) > 0:
        f_vseq_pos.write(struct.pack('I', variant.POS))  # The original base
      if len(alt) > 1:
        f_vseq_pos.write(struct.pack('{:d}I'.format(len(alt) - len(ref)), *[new_ref_pos + 1] * (len(alt) - len(ref))))
  else:
    new_ref_pos = variant.POS - 1  # Keep us here - we didn't implement this variant and we should keep copying
                                   # -1 because POS is 1-indexed, we are 0-indexed internally
  return new_ref_pos

There were several functions like this. I was refactoring this code and when I would work on these functions I would spend minutes trying to find the actual body of the function, often puzzling over what turned out to be a doctest and not the actual code. I had been warned about this by well meaning relatives.

Doctests are awesome when you can combine a test and example and it is pithy. When you need to setup complex data structures or you need to test lots of use cases, it is better to put tests in a separate file, in a separate directory and run them with nose.

Doctests can still be used in the documentation as examples of using functions with the added benefit that the examples are kept upto date by being tested.

h5py and multiprocessing

The HDF5 format has been working awesome for me, but I ran into danger when I started to mix it with multiprocessing. It was the worst kind of danger: the intermittent error. Here are the dangers/issues in order of escalation (TL;DR is use a generator to feed data from your file into the child processes as they spawn. It's the easiest way. Read on for harder ways.) An h5py file handle can't be pickled and therefore can't be passed as an argument using pool.map() If you set the handle as a global and access it from the child processes you run the risk of racing which leads to corrupted reads. My personal runin was that my code sometimes ran fine but sometimes would complain that there are NaNs or Infinity in the data. This wasted some time tracking down. Other people have had this kind of problem [ 1 ]. Same problem if you pass the filename and have the different processes open individual instances of the file separately. The hard way to solve this problem is to sw...

Assorted Experience

Search This Blog