Skip to main content

Doctests or nosetests?

I had a torrid love affair with doctests. It was AMAZING! You mean you can write documentation AND test your code at the same time!? You can write a line of code, put down the answer next to it and the computer WILL CHECK IT FOR YOU?

Well, this went on for several months. Then one morning I woke up next to this:

def handle_variant(variant, f_vseq, f_vseq_pos, strand_no=None):
  """Write the variant to the mutated sequence and fill out the pos array

  Args:
    variant: record exposing POS (1-indexed), REF, ALT[0].sequence and, when strand_no is given,
      genotype('sample').data.GT. Both unphased ('0/1') and phased ('0|1') genotypes are accepted.
    f_vseq: writable stream that receives the ALT bases
    f_vseq_pos: writable binary stream that receives the positions, each packed with struct format 'I'
    strand_no: index into the genotype selecting which strand's allele to use. None means we don't care about
      zygosity and always take the ALT

  Returns:
    The 0-indexed position in the reference to continue from: len(REF) + POS - 1 if the ALT was written,
    POS - 1 if this strand carries the REF and nothing was written.

  Some setup
  >>> import io, vcf, struct
  >>> def add_GT(v, gt):
  ...   md=vcf.model.make_calldata_tuple('GT')(gt)
  ...   call=vcf.model._Call(v,'sample',md)
  ...   v.samples = [call]
  ...   v._sample_indexes = {'sample': 0}

  Test with SNP: ignore zygosity
  >>> ref_seq = 'ACTGACTG'; \
  pos = 2; \
  ref = 'C'; \
  alt = [vcf.model._Substitution('T')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  add_GT(variant, '1/1'); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  T
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)

  Test with SNP: homo, strand 0
  >>> f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 0)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  T
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)


  Test with SNP: homo, strand 1
  >>> f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 1)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  T
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)



  Test with SNP: het, strand 0
  >>> add_GT(variant, '0/1'); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 0)
  1
  >>> _ = f_vseq.seek(0); print f_vseq.read()

  >>> _ = f_vseq_pos.seek(0,2); print f_vseq_pos.tell()
  0

  Test with SNP: het, strand 1
  >>> f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 1)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  T
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)

  Test with SNP: phased het, strand 0 carries the ALT
  >>> add_GT(variant, '1|0'); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 0)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  T
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)


  Test with delete: ignore zygosity
  >>> ref_seq = 'ACTGACTG'; \
  pos = 2; \
  ref = 'CTG'; \
  alt = [vcf.model._Substitution('C')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  add_GT(variant, '1/0'); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  4
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  C
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('I',f_vseq_pos.read(4))
  (2,)

  Test with same delete, strand 1 (same as REF)
  >>> f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos, 1)
  1
  >>> _ = f_vseq.seek(0); print f_vseq.read()

  >>> _ = f_vseq_pos.seek(0,2); print f_vseq_pos.tell()
  0


  Test with delete and .
  >>> ref_seq = 'ACTGACTG'; \
  pos = 8; \
  ref = 'G'; \
  alt = [vcf.model._Substitution('.')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  8
  >>> _ = f_vseq.seek(0); print f_vseq.read()

  >>> _ = f_vseq_pos.seek(0,2); print f_vseq_pos.tell()
  0


  Test with delete and .
  >>> ref_seq = 'ACTGACTG'; \
  pos = 4; \
  ref = 'GACTG'; \
  alt = [vcf.model._Substitution('.')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  8
  >>> _ = f_vseq.seek(0); print f_vseq.read()

  >>> _ = f_vseq_pos.seek(0,2); print f_vseq_pos.tell()
  0


  Test with insert
  >>> ref_seq = 'ACTGACTG'; \
  pos = 2; \
  ref = 'C'; \
  alt = [vcf.model._Substitution('CGGG')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  2
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  CGGG
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('4I',f_vseq_pos.read(4*4))
  (2, 3, 3, 3)


  Test with insert
  >>> ref_seq = 'ACTGACTG'; \
  pos = 8; \
  ref = 'G'; \
  alt = [vcf.model._Substitution('GTTT')]; \
  variant = vcf.model._Record('1', pos, '.', ref, alt, 100, None, None, None, None, None); \
  f_vseq = io.BytesIO(); \
  f_vseq_pos = io.BytesIO(); \
  print handle_variant(variant, f_vseq, f_vseq_pos)
  8
  >>> _ = f_vseq.seek(0); print f_vseq.read()
  GTTT
  >>> _ = f_vseq_pos.seek(0); print struct.unpack('4I',f_vseq_pos.read(4*4))
  (8, 9, 9, 9)

  See Readme.md for details of algorithm
  """
  # 0 means REF 1 means ALT. If we don't specify a strand number it means we don't care about Zygosity and will always
  # take the ALT
  if strand_no is None:
    var_type = '1'
  else:
    # The GT separator is '/' for unphased and '|' for phased calls; normalise so both split into per-strand alleles
    gt = variant.genotype('sample').data.GT
    var_type = gt.replace('|', '/').split('/')[strand_no]

  if var_type != '1':
    # Keep us here - we didn't implement this variant and we should keep copying
    # -1 because POS is 1-indexed, we are 0-indexed internally
    return variant.POS - 1

  alt = variant.ALT[0].sequence
  if alt == '.':
    alt = ''  # '.' stands for "no bases"
  ref = variant.REF
  if ref == '.':
    ref = ''
  f_vseq.write(alt)    # Copy over ALT
  new_ref_pos = len(ref) + variant.POS - 1  # Advance along ref_seq
                                            # -1 because POS is 1-indexed, we are 0-indexed internally
  if alt:
    if ref:
      f_vseq_pos.write(struct.pack('I', variant.POS))  # The original base
    if len(alt) > 1:
      # The bases beyond REF all get the position that follows the variant in the reference.
      # NOTE(review): when REF and ALT are both longer than one base, 1 + n_extra positions are written for
      # len(alt) bases, and a negative n_extra makes struct.pack raise struct.error - confirm against Readme.md
      # whether such variants are expected to reach this function.
      n_extra = len(alt) - len(ref)
      f_vseq_pos.write(struct.pack('{:d}I'.format(n_extra), *[new_ref_pos + 1] * n_extra))
  return new_ref_pos

There were several functions like this. I was refactoring this code, and when I would work on these functions I would spend minutes trying to find the actual body of the function, often puzzling over what turned out to be a doctest and not the actual code. I had been warned about this by well-meaning relatives.

Doctests are awesome when you can combine a test and example and it is pithy. When you need to set up complex data structures or you need to test lots of use cases, it is better to put tests in a separate file, in a separate directory and run them with nose.

Doctests can still be used in the documentation as examples of using functions, with the added benefit that the examples are kept up to date by being tested.







Comments

Popular posts from this blog

Flowing text in inkscape (Poster making)

You can flow text into arbitrary shapes in inkscape. (From a hint here).

You simply create a text box, type your text into it, create a frame with some drawing tool, select both the text box and the frame (click and shift) and then go to text->flow into frame.

UPDATE:

The omnipresent anonymous asked:
Trying to enter sentence so that text forms the number three...any ideas?
The solution:
1. Type '3' using the text tool
2. Convert to path using object->path
3. Size as necessary
4. Remove fill
5. Ungroup
6. Type in actual text in new text box
7. Select the text and the '3' path
8. Flow the text

Drawing circles using matplotlib

Use the pylab.Circle command

import pylab  # Imports matplotlib and a host of other useful modules

# Creates a patch that looks like a circle (fc= face color)
cir1 = pylab.Circle((0, 0), radius=0.75, fc='y')
# Repeat (alpha=.2 means make it very translucent)
cir2 = pylab.Circle((.5, .5), radius=0.25, alpha=.2, fc='b')
# Creates empty axes (aspect=1 means scale things so that circles look like circles)
ax = pylab.axes(aspect=1)
ax.add_patch(cir1)  # Grab the current axes, add the patch to it
ax.add_patch(cir2)  # Repeat
pylab.show()

Pandas panel = collection of tables/data frames aligned by index and column

Pandas panel provides a nice way to collect related data frames together while maintaining correspondence between the index and column values:


import pandas as pd, pylab #Full dimensions of a slice of our panel index = ['1','2','3','4'] #major_index columns = ['a','b','c'] #minor_index df = pd.DataFrame(pylab.randn(4,3),columns=columns,index=index) #A full slice of the panel df2 = pd.DataFrame(pylab.randn(3,2),columns=['a','c'],index=['1','3','4']) #A partial slice df3 = pd.DataFrame(pylab.randn(2,2),columns=['a','b'],index=['2','4']) #Another partial slice df4 = pd.DataFrame(pylab.randn(2,2),columns=['d','e'],index=['5','6']) #Partial slice with a new column and index pn = pd.Panel({'A': df}) pn['B'] = df2 pn['C'] = df3 pn['D'] = df4 for key in pn.items: print pn[key] -> output …