Some fixes
authorKai Blin <kai@samba.org>
Sat, 14 Jan 2012 14:26:28 +0000 (01:26 +1100)
committerKai Blin <kai@samba.org>
Sat, 14 Jan 2012 14:26:28 +0000 (01:26 +1100)
antismash_lca2011.html
degenerate_code.svg [new file with mode: 0644]
drawings/languages_used.odg
script.txt
sequence_annotated.svg

index 66985c2..4320754 100644 (file)
   <div class="slide" id="antismash-find-orfs-all"></div>
   </section>
 
+  <section class="slide" id="antismash-why-aa-seqs">
+  <h2>Why Use Amino Acid Sequences?</h2>
+  <img src="degenerate_code.svg">
+  </section>
+
   <section class="slide" id="antismash-cluster-identification">
   <h2>Cluster Identification</h2>
   </section>
diff --git a/degenerate_code.svg b/degenerate_code.svg
new file mode 100644 (file)
index 0000000..f13e58c
--- /dev/null
@@ -0,0 +1,159 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="616.91821"
+   height="184.57031"
+   id="svg2816"
+   version="1.1"
+   inkscape:version="0.47 r22583"
+   sodipodi:docname="New document 2">
+  <defs
+     id="defs2818">
+    <inkscape:perspective
+       sodipodi:type="inkscape:persp3d"
+       inkscape:vp_x="0 : 300 : 1"
+       inkscape:vp_y="0 : 1000 : 0"
+       inkscape:vp_z="800 : 300 : 1"
+       inkscape:persp3d-origin="400 : 200 : 1"
+       id="perspective2824" />
+    <inkscape:perspective
+       id="perspective2842"
+       inkscape:persp3d-origin="0.5 : 0.33333333 : 1"
+       inkscape:vp_z="1 : 0.5 : 1"
+       inkscape:vp_y="0 : 1000 : 0"
+       inkscape:vp_x="0 : 0.5 : 1"
+       sodipodi:type="inkscape:persp3d" />
+    <inkscape:perspective
+       id="perspective2842-8"
+       inkscape:persp3d-origin="0.5 : 0.33333333 : 1"
+       inkscape:vp_z="1 : 0.5 : 1"
+       inkscape:vp_y="0 : 1000 : 0"
+       inkscape:vp_x="0 : 0.5 : 1"
+       sodipodi:type="inkscape:persp3d" />
+    <inkscape:perspective
+       id="perspective2842-7"
+       inkscape:persp3d-origin="0.5 : 0.33333333 : 1"
+       inkscape:vp_z="1 : 0.5 : 1"
+       inkscape:vp_y="0 : 1000 : 0"
+       inkscape:vp_x="0 : 0.5 : 1"
+       sodipodi:type="inkscape:persp3d" />
+    <inkscape:perspective
+       id="perspective2889"
+       inkscape:persp3d-origin="0.5 : 0.33333333 : 1"
+       inkscape:vp_z="1 : 0.5 : 1"
+       inkscape:vp_y="0 : 1000 : 0"
+       inkscape:vp_x="0 : 0.5 : 1"
+       sodipodi:type="inkscape:persp3d" />
+  </defs>
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="0.84852814"
+     inkscape:cx="360.68694"
+     inkscape:cy="136.38418"
+     inkscape:current-layer="layer1"
+     inkscape:document-units="px"
+     showgrid="false"
+     inkscape:window-width="1067"
+     inkscape:window-height="826"
+     inkscape:window-x="217"
+     inkscape:window-y="147"
+     inkscape:window-maximized="0" />
+  <metadata
+     id="metadata2821">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     id="layer1"
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     transform="translate(-32.135033,-92.136459)">
+    <text
+       xml:space="preserve"
+       style="font-size:400px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:DejaVu Sans;-inkscape-font-specification:DejaVu Sans"
+       x="447.15079"
+       y="244.08958"
+       id="text2830"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan2832"
+         x="447.15079"
+         y="244.08958"
+         style="font-size:200px;font-style:normal;font-variant:normal;font-weight:200;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:DejaVu Sans;-inkscape-font-specification:DejaVu Sans Ultra-Light">}</tspan></text>
+    <g
+       id="g2908"
+       transform="translate(0,-7.5595856)">
+      <text
+         id="text2826"
+         y="142.73761"
+         x="31.556908"
+         style="font-size:32px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:DejaVu Sans Mono;-inkscape-font-specification:DejaVu Sans Mono"
+         xml:space="preserve"><tspan
+           y="142.73761"
+           x="31.556908"
+           id="tspan2828"
+           sodipodi:role="line">ATGGCGGGGATTTGCTGA</tspan></text>
+      <text
+         id="text2826-3"
+         y="188.42865"
+         x="31.556908"
+         style="font-size:32px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:DejaVu Sans Mono;-inkscape-font-specification:DejaVu Sans Mono"
+         xml:space="preserve"><tspan
+           y="188.42865"
+           x="31.556908"
+           id="tspan2828-3"
+           sodipodi:role="line">ATGGCCGGTATATGTTAA</tspan></text>
+      <text
+         id="text2826-0"
+         y="234.11969"
+         x="31.556908"
+         style="font-size:32px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:DejaVu Sans Mono;-inkscape-font-specification:DejaVu Sans Mono"
+         xml:space="preserve"><tspan
+           y="234.11969"
+           x="31.556908"
+           id="tspan2828-4"
+           sodipodi:role="line">GTGGCTGGAATCTGCTAG</tspan></text>
+      <text
+         id="text2826-6"
+         y="264.97479"
+         x="204.82645"
+         style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:DejaVu Sans Mono;-inkscape-font-specification:DejaVu Sans Mono"
+         xml:space="preserve"><tspan
+           y="264.97479"
+           x="204.82645"
+           id="tspan2828-8"
+           sodipodi:role="line">...</tspan></text>
+    </g>
+    <text
+       xml:space="preserve"
+       style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:DejaVu Sans;-inkscape-font-specification:DejaVu Sans"
+       x="507.90091"
+       y="198.98216"
+       id="text2826-8"><tspan
+         sodipodi:role="line"
+         id="tspan2828-5"
+         x="507.90091"
+         y="198.98216"
+         style="font-size:40px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:DejaVu Sans Mono;-inkscape-font-specification:DejaVu Sans Mono">MAGIC*</tspan></text>
+  </g>
+</svg>
index ac17aae..d79259b 100644 (file)
Binary files a/drawings/languages_used.odg and b/drawings/languages_used.odg differ
index 83b65d8..d0c858b 100644 (file)
@@ -277,18 +277,33 @@ identification step. Here we start with identifying possible open reading
 frames. An open reading frame is something that looks like a gene, but hasn't
 been confirmed to produce anything experimentally. We call this an open reading
 frame because a ribosome could read in 3-letter steps from a start codon, to a
-stop codon later in the sequence. The exact process is a bit complicated, but
-it's basically a fancy way of doing the following: Look for a start tag (ATG or
-GTG) and then look for a stop codon downstream. Now, take all those hits and
-combine them in a way that gives you the maximal number of long genes. This
-heuristic turns out to work well for the data we're seeing.
-
-We tried a couple of implementations for this to optimize for speed and
-accuracy, but in the end we settled for using the preexisting "Glimmer" tool.
+stop codon later in the sequence.
+
+Let's do an example. Take this sequence here. First, we find all start codons.
+We're looking for ATG and GTG. I'll leave this on for a couple of seconds so
+you can have a go. Humans are actually pretty good at pattern matching, so you
+can actually train yourself to perform pretty well at this task. Ok, so those
+are the ones I found. Now, let's repeat the process for stop codons. Here,
+we're looking for TGA, TAG and TAA. Again, I found those. Now comes the fun
+part. We need to find matching start/stop codons that have a distance that's a
+multiple of three. The two easy ones on this slide are these two. This just
+leaves us with the problematic choice between the two alternatives down here.
+The TAG matches to the GTG here, and the TAA matches to the ATG here. It's a
+bit hard to decide which of these to take. As a rule of thumb, the longer the
+better, but in this case that doesn't really help much. The usual trick for the
+hard cases is to take the ones that look more like the other ORFs you found on
+the sequence. Figuring out how to deal with conflicts like that is where most
+gene finding tools differ. We tried a couple of implementations to optimize for
+speed and accuracy, but in the end we settled for using the preexisting
+"Glimmer" tool, which is available under the Artistic License.
 
 Now that we've found genes, we need to identify interesting gene clusters, that
 is, gene clusters related to secondary metabolites. We do this by building up
-profiles of known examples of secondary metabolite genes.
+profiles of known secondary metabolite genes. There's a catch however. Remember
+the degenerate code that mapped codons to amino acids? For many amino acids,
+there's more than one way to encode them. To avoid this problem, you usually work
+on the amino acid sequence. This has the added benefit that you no longer need
+to count to three all the time, it's one letter, one amino acid.
 
 Now that we've identified interesting gene clusters, we compare them gene by
 gene with other know secondary metabolite clusters.
index 8b7809b..b8ff200 100644 (file)
    style="fill:#4682b4"
    id="tspan3380">CAG</tspan>CGA<tspan
    style="fill:#4682b4"
-   id="tspan3382">TTG</tspan>CA</tspan><tspan
+   id="tspan3382">TTG</tspan>CAG</tspan><tspan
    style="fill:#ff4500"
-   id="tspan3332">TAG</tspan>CGGGGCGTCGCTCTCC<tspan
+   id="tspan3332">TAG</tspan>GGGGCGTCGCTCTCC<tspan
    style="fill:#00ff7f"
    id="tspan3249">GTG</tspan></tspan><tspan
          sodipodi:role="line"