Skip to content

Latest commit



271 lines (256 loc) · 9.83 KB

File metadata and controls

271 lines (256 loc) · 9.83 KB

Generic Transcription Conventions (DRAFT!)

Bernhard Fisseni, Thomas Schmidt


This document describes the generic conventions for transcripts of spoken language that are supported by both the IDS TEILicht web services and this library. They are applied in the segmentize step of processing.

Specification and Examples

  1. The segmentation works on un-analysed <u> elements in a TEI-ISO transcript document, such as one produced from the Simple EXMARaLDA format.

    Utterances <u> that contain anything besides text content and time <anchor>s are not analysed. Comments and processing instructions inside <u> will be removed. Whitespace will be normalized.

  2. An utterance <u> is mainly split into words, which will be annotated as <w> elements, and punctuation, which will become <pc> elements.

  3. Uncertain content is placed in single parentheses; parentheses are allowed to contain incomprehensible content.

    TIM: (Hallo) Tim!
    TOM: (++++++ Tom!)

    It is not practical to use parentheses within parentheses.

  4. Pauses are indicated by full stops in parentheses. The pause length is determined by the number of full stops: one for a short, two for a medium, and three for a long pause; four or more indicate a very long pause.

    TIM: Hallo (...) Wim!

Example document

The example document from the Simple EXMARaLDA format documentation is parsed as follows:

<?xml version="1.0" encoding="UTF-8"?><TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xs="http://www.w3.org/2001/XMLSchema">
            <person id="KIM" n="KIM">
            <person id="TIM" n="TIM">
            <person id="TIMOTHEUS" n="TIMOTHEUS">
            <person id="TOM" n="TOM">
            <person id="WIM" n="WIM">
            <person id="tom" n="tom">
            <application ident="IDS_TEILicht" version="0.1">
               <label>IDS TEILicht</label>
               <desc>TEI Conversion Webservices</desc>
         <transcriptionDesc ident="generic" version="2018">
          <!--Fill me in-->
          <!--Fill me in-->
         <change when="2018-11-14T10:57:57.294Z">created from Simple EXMARaLDA plain text transcript; language set to «deu»</change>
      <change when="2018-11-14T10:58:00.907Z">segmented according to generic transcription conventions</change></revisionDesc>
   <text lang="deu">
      <timeline unit="ORDER">
         <when id="B_1"/>
         <when id="E_1"/>
         <when id="B_2"/>
         <when id="E_2"/>
         <when id="B_3"/>
         <when id="E_3"/>
         <when id="B_4"/>
         <when id="E_4"/>
         <when id="B_5"/>
         <when id="E_5"/>
         <when id="B_6"/>
         <when id="E_6"/>
         <when id="B_7"/>
         <when id="E_7"/>
         <when id="B_8"/>
         <when id="E_8"/>
         <when id="B_9"/>
         <when id="E_9"/>
         <when id="B_10"/>
         <when id="E_10"/>
         <when id="B_11"/>
         <when id="E_11"/>
         <when id="B_12"/>
         <when id="E_12"/>
         <when id="B_13"/>
         <when id="E_13"/>
         <when id="B_14"/>
         <when id="E_14"/>
         <when id="B_15"/>
         <when id="E_15"/>
         <when id="B_16"/>
         <when id="E_16"/>
         <when id="B_17"/>
         <when id="E_17"/>
         <when id="B_18"/>
         <when id="E_18"/>
         <when id="B_19"/>
         <when id="E_19"/>
         <when id="B_20"/>
         <when id="E_20"/>
         <when id="B_21"/>
         <when id="E_21"/>
         <when id="B_22"/>
         <when id="E_22"/>
         <when id="B_23"/>
         <!--marked as ‹1› in the input.-->
         <when id="M_1"/>
         <when id="ME_1"/>
         <when id="E_24"/>
         <annotationBlock from="#B_1" to="#E_1" who="TOM">
         <annotationBlock from="#B_2" to="#E_2" who="tom">
         <annotationBlock from="#B_3" to="#E_3" who="TIM">
         <annotationBlock from="#B_4" to="#E_4" who="TOM">
         <annotationBlock from="#B_5" to="#E_5" who="tom">
         <annotationBlock from="#B_6" to="#E_6" who="TOM">
         <annotationBlock from="#B_7" to="#E_7" who="TIM">
         <annotationBlock from="#B_8" to="#E_8" who="TOM">
         <annotationBlock from="#B_9" to="#E_9" who="TIMOTHEUS">
         <annotationBlock from="#B_10" to="#E_10" who="TOM">
         <annotationBlock from="#B_11" to="#E_11" who="TOM">
         <incident end="#E_12" start="#B_12">
         <annotationBlock from="#B_12" to="#E_12" who="TOM">
         <incident end="#E_13" start="#B_13">
         <annotationBlock from="#B_13" to="#E_13" who="TIM">
         <annotationBlock from="#B_14" to="#E_14" who="KIM">
            <u><!--This node was not parsed, as it contains mixed content.-->
               <incident end="#E_14" start="#B_14">
         <incident end="#E_15" start="#B_15">
         <annotationBlock from="#B_16" to="#E_16" who="TIM">
            <u><w dur="2 syl" type="incomprehensible"/><w dur="1 syl" type="incomprehensible"/><pc>!</pc></u>
         <annotationBlock from="#B_17" to="#E_17" who="TOM">
            <u><w dur="2 syl" type="incomprehensible"/><w dur="1 syl" type="incomprehensible"/><pc>!</pc></u>
         <annotationBlock from="#B_18" to="#E_18" who="TIM">
         <annotationBlock from="#B_19" to="#E_19" who="TOM">
            <u><w dur="2 syl" type="incomprehensible"/><unclear><w>Tom</w><pc>!</pc></unclear></u>
         <annotationBlock from="#B_20" to="#E_20" who="TIM">
            <u><w>Hallo</w><pause type="long"/><w>Wim</w><pc>!</pc></u>
         <incident end="#E_21" start="#B_21">
         <annotationBlock from="#B_21" to="#E_21" who="TOM">
               <span from="#B_21" to="#E_21" type="comment">Salut, Tim!</span>
         <incident end="#E_22" start="#B_22">
         <annotationBlock from="#B_22" to="#E_22" who="TIM">
               <span from="#B_22" to="#E_22" type="comment">Salut, Tom!</span>
         <incident end="#E_23" start="#B_23">
         <annotationBlock from="#B_23" to="#ME_1" who="TOM">
            <u><w>Hallo</w><pc>,</pc><w><anchor synch="#M_1"/>Tim</w><pc>!</pc></u>
               <span from="#B_23" to="#ME_1" type="comment">Salut, Tim!</span>
         <incident end="#E_24" start="#B_24">
         <annotationBlock from="#M_1" to="#E_24" who="TIM">
            <u><w>Hallo<anchor synch="#ME_1"/></w><pc>,</pc><w>Tom</w><pc>.</pc></u>
               <span from="#M_1" to="#E_24" type="comment">Salut, Tom!</span>