Sunday, July 6, 2014

Doctests or nosetests?

I had a torrid love affair with doctests. It was AMAZING! You mean you can write documentation AND test your code at the same time!? You can write a line of code, put down the answer next to it and the computer WILL CHECK IT FOR YOU?

Well, this went on for several months. Then one morning I woke up next to this:

def handle_variant(variant, f_vseq, f_vseq_pos, strand_no=None):
  """Write the variant to the mutated sequence and fill out the pos array

  Some setup
  >>> import io, vcf, struct
  >>> def add_GT(v, gt):
  ...   md=vcf.model.make_calldata_tuple('GT')(gt)
  ...   call=vcf.model._Call(v,'sample',md)
  ...   v.samples = [call]
  ...   v._sample_indexes = {'sample': 0}

  Test with SNP: ignore zygosity
  >>> ref_seq = 'ACTGACTG'; \
  pos = 2; \
  ref = 'C'; \
  alt = [vcf.model._Substitution('T')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  add_GT(variant, '1/1'); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  T
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)

  Test with SNP: homo, strand 0
  >>> f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 0)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  T
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)


  Test with SNP: homo, strand 1
  >>> f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 1)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  T
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)



  Test with SNP: het, strand 0
  >>> add_GT(variant, '0/1'); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 0)
  1
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  
  >>> _ = f_vseq_pos.seek(0,2); print f_vseq.tell()
  0

  Test with SNP: het, strand 1
  >>> f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 1)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  T
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)


  Test with delete: ignore zygosity
  >>> ref_seq = 'ACTGACTG'; \
  pos = 2; \
  ref = 'CTG'; \
  alt = [vcf.model._Substitution('C')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  add_GT(variant, '1/0'); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  4
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  C
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)

  Test with same delete, strand 1 (same as REF)
  >>> f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 1)
  1
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  
  >>> _ = f_vseq_pos.seek(0,2); print f_vseq.tell()
  0


  Test with delete and .
  >>> ref_seq = 'ACTGACTG'; \
  pos = 8; \
  ref = 'G'; \
  alt = [vcf.model._Substitution('.')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  8
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  
  >>> _ = f_vseq_pos.seek(0,2); print f_vseq.tell()
  0


  Test with delete and .
  >>> ref_seq = 'ACTGACTG'; \
  pos = 4; \
  ref = 'GACTG'; \
  alt = [vcf.model._Substitution('.')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  8
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  
  >>> _ = f_vseq_pos.seek(0,2); print f_vseq.tell()
  0


  Test with insert
  >>> ref_seq = 'ACTGACTG'; \
  pos = 2; \
  ref = 'C'; \
  alt = [vcf.model._Substitution('CGGG')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  CGGG
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('4I',f_vseq_pos.read(4*4))
  (2, 3, 3, 3)


  Test with insert
  >>> ref_seq = 'ACTGACTG'; \
  pos = 8; \
  ref = 'G'; \
  alt = [vcf.model._Substitution('GTTT')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  8
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  GTTT
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('4I',f_vseq_pos.read(4*4))
  (8, 9, 9, 9)

  See Readme.md for details of algorithm
  """
  var_type = '1' if strand_no is None else variant.genotype('sample').data.GT.split('/')[strand_no]
  # 0 means REF 1 means ALT. If we don't specify a strand number it means we don't care about Zygosity and will always
  # take the ALT
  if var_type == '1':
    alt = variant.ALT[0].sequence
    if alt == '.': alt = ''
    ref = variant.REF
    if ref == '.': ref = ''
    f_vseq.write(alt)    # Copy over ALT
    new_ref_pos = len(ref) + variant.POS - 1  # Advance along ref_seq
                                              # -1 because POS is 1-indexed, we are 0-indexed internally
    if len(alt) > 0:
      if len(ref) > 0:
        f_vseq_pos.write(struct.pack('I', variant.POS))  # The original base
      if len(alt) > 1:
        f_vseq_pos.write(struct.pack('{:d}I'.format(len(alt) - len(ref)), *[new_ref_pos + 1] * (len(alt) - len(ref))))
  else:
    new_ref_pos = variant.POS - 1  # Keep us here - we didn't implement this variant and we should keep copying
                                   # -1 because POS is 1-indexed, we are 0-indexed internally
  return new_ref_pos

There were several functions like this. I was refactoring this code and when I would work on these functions I would spend minutes trying to find the actual body of the function, often puzzling over what turned out to be a doctest and not the actual code. I had been warned about this by well meaning relatives.

Doctests are awesome when you can combine a test and example and it is pithy. When you need to setup complex data structures or you need to test lots of use cases, it is better to put tests in a separate file, in a separate directory and run them with nose.

Doctests can still be used in the documentation as examples of using functions with the added benefit that the examples are kept upto date by being tested.







No comments:

Post a Comment