National Center for Biotechnology Information

National Library of Medicine

National Institutes of Health, Building 38A

8600 Rockville Pike

Bethesda, MD  20894

301-496-2475         FAX 301-480-9241

 

 

NCBI Software Development ToolKit

Version 1.9 - August 1, 1994

 

 

Draft Copy

This documentation is always incomplete and under revision.


NCBI Software Development ToolKit

Short Table of Contents

 

Full Table of Contents........................................................................................................................................... 1

Overview................................................................................................................................................................... 9

Data Model............................................................................................................................................................... 19

CoreLib: Portable Core Library.......................................................................................................................... 31

AsnLib: ASN.1 Processing................................................................................................................................... 49

General Use Objects.............................................................................................................................................. 81

Bibliographic References..................................................................................................................................... 91

MEDLINE Data....................................................................................................................................................... 109

Biological Sequences............................................................................................................................................. 115

Collections of Sequences..................................................................................................................................... 159

Sequence Locations and Identifiers................................................................................................................. 167

Sequence Features................................................................................................................................................. 185

Sequence Alignments........................................................................................................................................... 215

Sequence Graphs................................................................................................................................................... 225

Sequence Utilities................................................................................................................................................... 229

Entrez Data Access................................................................................................................................................. 243

Vibrant User Interface Tools.............................................................................................................................. 257


Full Table of Contents

Full Table of Contents...........................................................................................................................................

Overview...................................................................................................................................................................

Introduction...............................................................................................................................................

Components Of The Software Development ToolKit....................................................................

ASN.1............................................................................................................................................

Data Model For Biological Sequences....................................................................................

CoreLib: Writing Portable Software......................................................................................

AsnLib: Reading and Writing ASN.1......................................................................................

Object Loaders: Combining AsnLib and the Data Model...................................................

Utilities........................................................................................................................................

Data Access..................................................................................................................................

Vibrant: A Portable Windowing System...............................................................................

A Few Samples.........................................................................................................................................

Using This Document.............................................................................................................................

Contacting NCBI......................................................................................................................................

Data Model...............................................................................................................................................................

Introduction...............................................................................................................................................

Biological Sequences...............................................................................................................................

Classes of Biological Sequences..........................................................................................................

Locations on Biological Sequences.....................................................................................................

Associating Annotation With Locations On Biological Sequences..........................................

Feature Tables.............................................................................................................................

Sequence Alignments................................................................................................................

Sequence Graph..........................................................................................................................

Collections of Related Biological Sequences....................................................................................

Consequences of the Data Model........................................................................................................

CoreLib: Portable Core Library..........................................................................................................................

Introduction...............................................................................................................................................

Application Frameworks.......................................................................................................................

Main Entry Point........................................................................................................................

Getting Program Arguments...................................................................................................

User Interface Elements..........................................................................................................................

Alerts............................................................................................................................................

Beeps.............................................................................................................................................

Monitors......................................................................................................................................

Configuration Files..................................................................................................................................

File Names..................................................................................................................................

File Format..................................................................................................................................

Configuration File Functions..................................................................................................

Error Processing.......................................................................................................................................

Posting An Error........................................................................................................................

User Error Strings......................................................................................................................

Customization............................................................................................................................

Configuration File Settings......................................................................................................

Preparing Error Message Files................................................................................................

Fetching and Displaying Errors..............................................................................................

Installing Custom Error Handlers..........................................................................................

Miscellaneous Utility Functions..............................................................................................

Files and Directories...............................................................................................................................

ANSI-Style Functions................................................................................................................

Directory Management.............................................................................................................

CD-ROM......................................................................................................................................

Customization............................................................................................................................

Memory Management.............................................................................................................................

ANSI-Style Functions................................................................................................................

Fixed Memory............................................................................................................................

Relocatable Memory.................................................................................................................

Byte Stores..................................................................................................................................................

String Functions.......................................................................................................................................

ANSI-Style Functions................................................................................................................

Additional String Functions.....................................................................................................

Number Strings..........................................................................................................................

Time Strings................................................................................................................................

SGML Strings..............................................................................................................................

ValNode Functions.................................................................................................................................

Math Functions........................................................................................................................................

Macros..........................................................................................................................................

Arithmetic Functions.................................................................................................................

Transcendental Functions...........................................................................................................

Gamma Functions......................................................................................................................

Advanced Functions..................................................................................................................

Miscellaneous Utilities...........................................................................................................................

Macros..........................................................................................................................................

Random Numbers......................................................................................................................

Sorting..........................................................................................................................................

Time..............................................................................................................................................

Process ID....................................................................................................................................

Application Properties..............................................................................................................

Debugging Macros.....................................................................................................................

Portability Issues......................................................................................................................................

Portable Types............................................................................................................................

Integral Types...........................................................................................................

Floating-point Types...............................................................................................

Pointer Types............................................................................................................

Avoiding Name Collisions...................................................................................

Byte Order...................................................................................................................................

Function Prototypes..................................................................................................................

AsnLib: ASN.1 Processing...................................................................................................................................

Introduction to ASN.1............................................................................................................................

Why ASN.1..................................................................................................................................

Structure of ASN.1......................................................................................................................

Further information about ASN.1...........................................................................................

AsnLib: Overview....................................................................................................................................

Principles of Operation..........................................................................................................................

Specification for AsnLib........................................................................................................................

AsnTool......................................................................................................................................................

AsnTool Tutorial......................................................................................................................................

Using AsnLib............................................................................................................................................

AsnLib: A Tutorial...................................................................................................................................

getmesh.c.....................................................................................................................................

indexpub.c...................................................................................................................................

getpub.c........................................................................................................................................

Data-links...................................................................................................................................................

AsnLib Generated Header Files..........................................................................................................

Returns From AsnLib Parsing.............................................................................................................

Finding AsnTypePtrs at Run-time......................................................................................................

Custom Read and Write Functions....................................................................................................

Customizing an AsnIo Stream.............................................................................................................

ASN.1 Object Loaders.............................................................................................................................

AsnLib and Object Loaders As a Generalized Iterator.................................................................

AsnLib and Object Loaders Provide a Generalized Copy and Compare................................

AsnLib Interface: asn.h..........................................................................................................................

General Use Objects..............................................................................................................................................

Introduction...............................................................................................................................................

Large Text Blocks: StringStore..............................................................................................................

The Date......................................................................................................................................................

Identifying Things: Object-id................................................................................................................

Identifying Things: Dbtag.....................................................................................................................

Identifying People: Person-id...............................................................................................................

Expressing Uncertainty with Fuzzy Integers: Int-fuzz.................................................................

Creating Your Own Objects: User-object...........................................................................................

ASN.1 Specification: general.asn........................................................................................................

C Structures and Functions: objgen.h................................................................................................

Bibliographic References.....................................................................................................................................

Introduction...............................................................................................................................................

Citation Components: Affiliation........................................................................................................

Citation Components: Authors............................................................................................................

Citation Components: Imprint.............................................................................................................

Citation Components: Title...................................................................................................................

Citing an Article.......................................................................................................................................

Citing a Journal........................................................................................................................................

Citing a Book.............................................................................................................................................

Citing a Proceedings...............................................................................................................................

Citing a Letter, Manuscript, or Thesis...............................................................................................

Citing Directly Submitted Data............................................................................................................

Citing a Patent..........................................................................................................................................

Identifying a Patent.................................................................................................................................

Citing an Article or Book which is In Press.....................................................................................

Special Cases: Unpublished, Unparsed, or Unusual....................................................................

Accommodating Any Publication Type............................................................................................

Grouping Different Forms of Citation for a Single Work.............................................................

Sets of Citations........................................................................................................................................

Comparing Citations..............................................................................................................................

ASN.1 Specification: biblio.asn...........................................................................................................

C Structures and Functions: objbibli.h..............................................................................................

ASN.1 Specification: pub.asn...............................................................................................................

C Structures and Functions: objpub.h...............................................................................................

MEDLINE Data.......................................................................................................................................................

Introduction...............................................................................................................................................

Structure of a MEDLINE Entry............................................................................................................

MeSH Index Terms..................................................................................................................................

Substance Records...................................................................................................................................

Database Cross Reference Records.....................................................................................................

Funding Identifiers..................................................................................................................................

Gene Symbols............................................................................................................................................

ASN.1 Specification: medline.asn.......................................................................................................

C Structures and Functions: objmedli.h............................................................................................

Biological Sequences.............................................................................................................................................

Introduction...............................................................................................................................................

Bioseq: the Biological Sequence...........................................................................................................

Seq-id: Identifying the Bioseq...............................................................................................................

Seq-annot: Annotating the Bioseq.......................................................................................................

Seq-descr: Describing the Bioseq and Placing It In Context........................................................

mol-type: The Molecule Type..................................................................................................

modif: Modifying Our Assumptions About a Bioseq..........................................................

method: Protein Sequencing Method.....................................................................................

name: A Descriptive Name......................................................................................................

title: A Descriptive Title............................................................................................................

org: What Organism Did this Come From?..........................................................................

comment: Commentary Text...................................................................................................

num: Applying a Numbering System to a Bioseq...............................................................

maploc: Map Location...............................................................................................................

pir: PIR Specific Data.................................................................................................................

sp: SWISSPROT Data..................................................................................................................

embl: EMBL Data........................................................................................................................

prf: PRF Data...............................................................................................................................

pdb: PDB Data.............................................................................................................................

genbank: GenBank Flatfile Specific Data...............................................................................

pub: Description of a Publication............................................................................................

region: Name of a Genomic Region.......................................................................................

user: A User-defined Structured Object..................................................................................

neighbors: Bioseqs Related by Sequence Similarity............................................................

create-date:..................................................................................................................................

update-date:.................................................................................................................................

het: Heterogen............................................................................................................................

Seq-inst: Instantiating the Bioseq........................................................................................................

Seq-inst: Virtual Bioseq.............................................................................................................

Seq-inst: Raw Bioseq..................................................................................................................

Seq-inst: Segmented Bioseq......................................................................................................

Seq-inst: Reference Bioseq........................................................................................................

Seq-inst: Constructed Bioseq....................................................................................................

Seq-inst: Typical or Consensus Bioseq...................................................................................

Seq-inst: Map Bioseqs................................................................................................................

Seq-hist: History of a Seq-inst...............................................................................................................

Seq-data: Encoding the Sequence Data Itself...................................................................................

IUPACaa: The IUPAC-IUB Encoding of Amino Acids.........................................................

NCBIeaa: Extended IUPAC Encoding of Amino Acids.......................................................

NCBIstdaa: A Simple Sequential Code for Amino Acids...................................................

NCBI8aa: An Encoding for Modified Amino Acids.............................................................

IUPAC3aa: A 3 Letter Display Code for Amino Acids........................................................

NCBIpaa: A Profile Style Encoding for Amino Acids.........................................................

IUPACna: The IUPAC-IUB Encoding for Nucleic Acids......................................................

NCBI4na: A Four Bit Encoding of Nucleic Acids..................................................................

NCBI2na: A Two Bit Encoding for Nucleic Acids.................................................................

NCBI8na: An Eight Bit Sequential Encoding for Modified Nucleic Acids.......................

NCBIpna: A Frequency Profile Encoding for Nucleic Acids..............................................

Tables of Sequence Codes......................................................................................................................

Mapping Between Different Sequence Alphabets..........................................................................

Data and Tools for Sequence Alphabets...........................................................................................

Pubdesc: Publication Describing a Bioseq.......................................................................................

Numbering: Applying a Numbering System to a Bioseq.............................................................

Num-cont: A Continuous Integer Numbering System.......................................................

Num-real: A Real Number Numbering Scheme..................................................................

Num-enum: An Enumerated Numbering Scheme...............................................................

Num-ref: Numbering by Reference to Another Bioseq......................................................

Numbering: C Structures and Utility Functions..................................................................

ASN.1 Specification: seq.asn................................................................................................................

ASN.1 Specification: seqblock.asn......................................................................................................

ASN.1 Specification: seqcode.asn.......................................................................................................

C Structures and Functions: objseq.h.................................................................................................

C Structures and Functions: objpubd.h.............................................................................................

C Structures and Functions: objblock.h.............................................................................................

C Structures and Functions: objcode.h..............................................................................................

Collections of Sequences.....................................................................................................................................

Introduction...............................................................................................................................................

Seq-entry: The Sequence Entry.............................................................................................................

Bioseq-set: A Set Of Seq-entrys.............................................................................................................

id: local identifier for this set...................................................................................................

coll: global identifier for this set.............................................................................................

level: nesting level of set..........................................................................................................

class: classification of sets.........................................................................................................

release: an explanatory string..................................................................................................

date:..............................................................................................................................................

descr: Seq-descr for this set.......................................................................................................

seq-set: the sequences and sets within the Bioseq-set..........................................................

annot: Seq-annots for the set....................................................................................................

Bioseq-sets are Convenient Packages................................................................................................

ASN.1 Specification: seqset.asn...........................................................................................................

C Structures and Functions: objsset.h................................................................................................

Sequence Locations and Identifiers.................................................................................................................

Introduction...............................................................................................................................................

Seq-id: Identifying Sequences...............................................................................................................

Seq-id: Semantics of Use........................................................................................................................

local: Privately Maintained Data.............................................................................................

other: A Local Textseq-id..........................................................................................................

general: Ids from Local Databases..........................................................................................

gibbsq, gibbmt: GenInfo Backbone Ids..................................................................................

genbank, embl, ddbj: The International Nucleic Acid Sequence Databases....................

pir: PIR International.................................................................................................................

swissprot: SWISS-PROT.............................................................................................................

prf: Protein Research Foundation...........................................................................................

patent: Citing a Patent...............................................................................................................

pdb: Citing a Biopolymer Chain from a Structure Database.............................................

giim: GenInfo Import Id...........................................................................................................

gi: A Stable, Uniform Id Applied to Sequences From All Sources....................................

Seq-id: The C Implementation..............................................................................................................

NCBI ID Database: Imposing Stable Seq-ids....................................................................................

Seq-loc: Locations on a Bioseq.............................................................................................................

null: A Gap..................................................................................................................................

empty: A Gap in an Alignment...............................................................................................

whole: A Reference to a Whole Bioseq..................................................................................

int: An Interval on a Bioseq......................................................................................................

packed-int: A Series of Intervals..............................................................................................

pnt: A Single Point on a Sequence...........................................................................................

packed-pnt: A Collection of Points.........................................................................................

mix: An Arbitrarily Complex Location.................................................................................

equiv: Equivalent Locations.....................................................................................................

bond: A Chemical Bond Between Two Residues..................................................................

feat: A Location Indirectly Referenced Through A Feature................................................

Seq-loc: The C Implementation............................................................................................................

ASN.1 Specification: seqloc.asn..........................................................................................................

C Structures and Functions: objloc.h.................................................................................................

Sequence Features.................................................................................................................................................

Introduction...............................................................................................................................................

Seq-feat: Structure of a Feature.............................................................................................................

id: Features Can Have Identifiers...........................................................................................

data: Structured Data Makes Feature Types Unique............................................................

partial: This Feature is Incomplete.........................................................................................

except: There is Something Biologically Exceptional..........................................................

comment: A Comment About This Feature..........................................................................

product: Does This Feature Produce Another Bioseq?........................................................

location: Source Location of This Feature..............................................................................

qual: GenBank Style Qualifiers................................................................................................

title: A User Defined Name......................................................................................................

ext: A User Defined Structured Extension..............................................................................

cit: Citations For This Feature.................................................................................................

exp-ev: Experimental Evidence...............................................................................................

xref: Linking To Other Features..............................................................................................

SeqFeatData: Type Specific Feature Data..........................................................................................

gene: Location Of A Gene.........................................................................................................

org: Source Organism Of The Bioseq......................................................................................

cdregion: Coding Region.........................................................................................................

prot: Describing A Protein.......................................................................................................

rna: Describing An RNA...........................................................................................................

pub: Publication About A Bioseq Region..............................................................................

seq: Tracking Original Sequence Sources..............................................................................

imp: Importing Features From Other Data Models.............................................................

region: A Named Region..........................................................................................................

comment: A Comment On A Region Of Sequence..............................................................

bond: A Bond Between Residues.............................................................................................

site: A Defined Site.....................................................................................................................

rsite: A Restriction Enzyme Cut Site......................................................................................

user: A User Defined Feature...................................................................................................

txinit: Transcription Initiation.................................................................................................

num: Applying Custom Numbering To A Region..............................................................

psec-str: Protein Secondary Structure.....................................................................................

non-std-residue: Unusual Residues.........................................................................................

het: Heterogen............................................................................................................................

Seq-feat Implementation in C...............................................................................................................

CdRegion: Coding Region.....................................................................................................................

orf: Open Reading Frame.........................................................................................................

Translation Information...........................................................................................................

Problems With Translations....................................................................................................

Genetic Codes...........................................................................................................................................

C Implementation Of Genetic Codes.....................................................................................

Rsite-ref: Reference To A Restriction Enzyme.................................................................................

RNA-ref: Reference To An RNA..........................................................................................................

Gene-ref: Reference To A Gene.............................................................................................................

Prot-ref: Reference To A Protein...........................................................................................................

Txinit: Transcription Initiation............................................................................................................

Current Genetic Code Table: gc.prt.....................................................................................................

ASN.1 Specification: seqfeat.asn.........................................................................................................

C Structures and Functions: objfeat.h................................................................................................

Sequence Alignments...........................................................................................................................................

Introduction...............................................................................................................................................

Seq-align.....................................................................................................................................................

type: global.................................................................................................................................

type: partial.................................................................................................................................

type: diags...................................................................................................................................

dim: Dimensionality Of The Alignment................................................................................

Score: Score Of An Alignment Or Segment.......................................................................................

Dense-diag: Segments For "diags" Seq-align...................................................................................

Dense-seg: Segments for "global" or "partial" Seq-align...............................................................

Std-seg: Aligning Any Bioseq Type With Any Other....................................................................

ASN.1 Specification: seqalign.asn......................................................................................................

C Structures and Functions: objalign.h.............................................................................................

Sequence Graphs...................................................................................................................................................

Introduction...............................................................................................................................................

Seq-graph: Graph on a Bioseq..............................................................................................................

ASN.1 Specification: seqres.asn..........................................................................................................

C Structures and Functions: objres.h.................................................................................................

Sequence Utilities...................................................................................................................................................

Introduction...............................................................................................................................................

Demo: seqtest.c..........................................................................................................................................

Finding Features and Descriptors in an Entry................................................................................

Exploring an Object Using ASN.1 Defined Names.......................................................................

C Structures and Functions: sequtil.h................................................................................................

C Structures and Functions: seqport.h..............................................................................................

Entrez Data Access.................................................................................................................................................

Introduction...............................................................................................................................................

Connecting To and Disconnecting From Data Sources................................................................

Scanning the List of Available Terms................................................................................................

Obtaining the UID Given an Accession Number...........................................................................

Obtaining the UIDs That Satisfy a Boolean Query........................................................................

Loading a Sequence Record..................................................................................................................

Loading a MEDLINE Record...............................................................................................................

Streaming Through All of the Data Records....................................................................................

Converting to FASTA Format...............................................................................................................

Converting to GenBank Format................................................................................................................

Converting to MEDLARS Format........................................................................................................

Loading a Document Summary...........................................................................................................

Loading a Set of Document Summaries............................................................................................

Retrieving Neighbors and Links.........................................................................................................

C Structures and Functions: accentr.h...............................................................................................

C Structures and Functions: casn.h....................................................................................................

Vibrant User Interface Tools..............................................................................................................................

Introduction...............................................................................................................................................

Programming Example..........................................................................................................................

Object Specification....................................................................................................................

Callback Functions....................................................................................................................

Reference....................................................................................................................................................

Object Data Types.......................................................................................................................

Callback Types...........................................................................................................................

General Global Variables.........................................................................................................

Window Objects.........................................................................................................................

Context Functions......................................................................................................................

Grouping Objects.......................................................................................................................

Button Objects.............................................................................................................................

List Objects..................................................................................................................................

Menu Objects...............................................................................................................................

Popup Object...............................................................................................................................

Prompt Object.............................................................................................................................

Text Objects.................................................................................................................................

Scroll Bar Object.........................................................................................................................

Slate and Panel Objects..............................................................................................................

Repeat Object..............................................................................................................................

Switch Object...............................................................................................................................

Icon Object...................................................................................................................................

Graphical Viewer Object...........................................................................................................

Doc Object....................................................................................................................................

Class Functions...........................................................................................................................

Miscellaneous Functions...........................................................................................................

Graphical Drawing Functions..................................................................................................

Index............................................................................................................................................................

Acknowledgments...................................................................................................................................

Trademarks................................................................................................................................................

 



Overview


Introduction
Components Of The Software Development ToolKit
A Few Samples
Using This Document
Contacting NCBI


 Introduction

Molecular biology is generating a host of data which are dramatically altering and deepening our understanding of the processes which underlie all living things. This new knowledge is already affecting medicine, agriculture, biotechnology, and basic science in fundamental and sweeping ways. However, the data on which our growing understanding is based is being accumulated and analyzed in thousands of laboratories all over the world, from large genome centers to small university laboratories, from large pharmaceutical companies to small biotech startups. It is being managed and analyzed on machines from small personal computers to supercomputers, on systems from a few disk files to large commercial database systems. These essential new data require specialized tools for analysis and management, so software tools are being developed in all these different environments at once.  Since molecular biology is an infant science, the data itself is not yet fully understood, so its fundamental properties and relationships are constantly being revised as well. Finally, the raw volume of molecular biology data is growing at an astonishing rate.

In recognition of the essential and growing role of bioinformatics in the United States, the National Center for Biotechnology Information (NCBI) was created by act of Congress in November 1988. This law mandates that NCBI shall:

 

1) Create automated systems for knowledge about molecular biology, biochemistry, and genetics.

2) Perform research into advanced methods of analyzing and interpreting molecular biology data.

3) Enable biotechnology researchers and medical care personnel to use the systems and methods developed.

4) Coordinate efforts to gather biotechnology information worldwide.

To approach these goals, NCBI has been organized into three interoperating branches. The Basic Research Branch (BRB) is a group of scientists who perform research into algorithms and methods for analyzing molecular biology data and publish results in peer reviewed journals, and keep the other branches abreast of the latest developments from a scientific perspective. The Information Resources Branch (IRB) maintains the infrastructure at NCBI, administers the distribution of data and services provided by NCBI to the community, supports a visiting scientist program to enable researchers to spend time working at NCBI, and interacts with other agencies and bodies. The Information Engineering Branch (IEB) designs and builds databases and software tools for molecular biology information which attempt to incorporate the new approaches and meet the needs of the BRB, while producing data and software tools which are released to the community on a production basis by the IRB.

This document describes the data model and software tools developed by the IEB to achieve its mission. The IEB approaches its task with an understanding of the situation outlined in the first paragraph, that molecular biology data comes from and is used in an extremely heterogeneous, distributed, and changing environment, from both computing and biological points of view. The data processed and integrated by IEB will come from many different sources which may use different models of the data, which can be expected to change over time. The data will be stored and managed on many different computer systems using many different database management systems. The data itself is expected to be valuable for longer than the life cycle of any particular computer system or program. This means that the data must be described in a controlled and formal way, so that all participants can clearly understand what data components are available in common at any time, but without dependence on any particular software tool or language, database management system, or hardware architecture.

Software developed by IEB must be capable of running on all major hardware platforms used in the scientific community and must be designed to be ported to new systems as the computer industry progresses. It must be capable of providing systems for data retrieval by end-user scientists while also providing software hooks for other programs written by bioinformatics specialists in commercial, academic, or government settings, and by academic researchers.

To achieve the goal of a formal, controlled, yet flexible data specification, IEB has adopted the use of Abstract Syntax Notation 1 (ASN.1), an International Standards Organization standard (ISO 8824, 8825) for describing and encoding data in a machine readable way which is independent of hardware or software architecture and language. IEB has created a formal specification in ASN.1 for biotechnology and bibliographic information. This specification is based on a data model which unifies sequence related data from bands on a gel to genetic maps to sequenced nucleic acid and protein molecules. It provides connections from such data to other specialized datasets such as stock center lists, taxonomies, or structures. The specification is done as a series of connected modules. This means selected modules can be reused by other biotechnology databases and new ones added to meet specialized needs. The ASN.1 specification and encoding provide an essential common ground, changing the many to many mapping between the various information sources and applications to a many to one mapping, both for data models and for software interfaces.

To achieve the goals of software portability and of providing different levels of access from database producer to programmer to end-user, IEB has developed a layered software toolkit. The toolkit is used internally at NCBI to process and analyze data from a variety of sources to build and maintain the unified databases and also serves as the components for the end-user applications NCBI distributes. This means it is subjected to the continuous demands for quality and performance imposed by a large, production operation in the course of our daily work. The source code for the toolkit is made available without restriction for use by anyone wishing to take advantage of the work done by NCBI. The software runs on a wide variety of common platforms and is layered to allow programmers to use either very low level or very high level tools to access and manipulate data.

Components Of The Software Development ToolKit

ASN.1

A brief introduction is provided to the ASN.1 language itself in the beginning of the AsnLib chapter. Those familiar with Backus-Naur form should have no trouble reading it immediately, while a short explanation may be required for others. It is a simple, logical way to specify data and is used for many purposes in the computer industry to describe and exchange data. A number of books, articles, and software tools from the computer industry at large are readily available for those who wish a more in-depth knowledge of ASN.1. This is an important aspect of choosing the ASN.1 language to describe biological data. ASN.1 is a formal data description language, developed, tested, and used within the computer industry, not an ad hoc file format developed by biologists. Would you program in an ad hoc programming language developed by biologists? Then why describe data that way?

Data Model For Biological Sequences

The selection of a data description language does not define what it is used for any more than the selection of English defines what a book is about. The IEB has defined a model for biotechnology information (which happens to be specified in ASN.1) which is centered around the concept of a biological sequence as a simple, linear coordinate system. Genetic and physical maps, sequenced pieces of nucleic acids and proteins, and complex assemblies of such components can all be considered specializations of the basic sequence concept of an identified coordinate system. Relationships between sequences (e.g. sequence alignments, sequence assemblies, relationships of genetic to physical maps) can all be considered mappings from one sequence coordinate system to another. Information about sequences can be considered mappings of specialized data objects (e.g. publications, genes, coding regions) to any sequence coordinate system. Such specialized data objects may themselves contain keys to other databases containing more specialized information not necessarily captured by the common data model, but unique to a particular organism, discipline, or database.

CoreLib: Writing Portable Software

The CoreLib is a small set of "C" language functions, macros, and guidelines that permit the writing of programs which compile and execute without change on over fourteen different hardware/operating system/compiler combinations. If one wishes to distribute one's code to as many molecular biologists as possible with as little work as possible, learning to write CoreLib style code is a tremendous advantage. If one wishes to write on one platform, but interface with NCBI software, one should still understand the CoreLib approach (read the introduction in the CoreLib chapter), but it does not require that one write CoreLib code oneself.

AsnLib: Reading and Writing ASN.1

AsnLib is a function library written with CoreLib, which provides functions for reading and validating ASN.1 specifications and generating parse trees to encode and decode data conforming to the specification. The parse trees can be generated dynamically at run-time from any input specification, or parse trees for particular specifications can be produced as "C" language header files to be incorporated into applications. Given a parse tree generated either way, AsnLib provides low level functions for encoding and decoding data in either the text or binary forms of ASN.1, one element at a time. Converters to other languages (ASN.1 to Prolog or ASN.1 to LISP have been done), filters (get all journal titles from an ASN.1 encoded stream of bibliographic citations), or indexing programs (index a file of ASN.1 encoded bibliographic citations on author name) can be written with tools at this level.

Object Loaders: Combining AsnLib and the Data Model

Every ASN.1 specification module in the NCBI data model has a corresponding "object loader" module. This is a "C" language ".c" and ".h" file which typedef a "C" structure for every entity defined in ASN.1 (called an "object" here). For each object there is a function to create it, read it from an ASN.1 stream, write it to an ASN.1 stream, and free it. These take the form of [AsnName]New(), [AsnName]AsnRead(), [AsnName]AsnWrite(), and [AsnName]Free(). If an "object" is considered data associated with methods, these routines define the structure of the data (as mapped from ASN.1) and define routines to load such objects in and out of memory from ASN.1.

In some cases additional functions such as compare, duplicate, find, or print are defined here as well. The Data Access layer returns pointers to these structures and the Utilities layer provides more routines to compare, explore, manipulate, and display these structures. Using the object loader layer incorporates a great deal of NCBI code into your application, but most programmers find this the easiest level to access NCBI data for complex objects such as whole sequence entries.

In the following document detailed discussion of an ASN.1 module and its corresponding object loader are combined together in a single chapter. The chapters are organized by grouping closely related objects together. The discussion in each chapter focuses on particular issues surrounding the implementation of that data type but may not mention every function. The complete ASN.1 specification and object loader ".h" files follow at the end of each such chapter for the comprehensive and definitive specification.

Utilities

A growing number of utility functions have been written that manipulate or analyze the structures defined in the object loaders. For example, one function compares two (arbitrarily complex) locations on sequences and determines if they overlap or if one is contained in the other. Another opens a "port" on any (arbitrarily complex) sequence or part(s) of a sequence(s) and treats it as a single sequence, in any selected sequence alphabet, with operations provided such as "seek to location", "get next residue", "read x residues into a buffer", and so on. A whole family of functions allows the exploration of any arbitrarily complex structure in memory with a call to a user supplied function when encountering any structure based on its ASN.1 name (e.g. find all coding region features, or find all publications, or find all author names in publications). Finally there are functions that will output a sequence entry in GenBank format, FASTA format, or a report format.

Data Access

A family of functions supplies high level access to sequence and bibliographic data on the Entrez:Sequences CDROM provided by NCBI. These functions allow the evaluation of Boolean operations on a list of terms, resulting in the sequence ids (or MEDLINE ids) that satisfy the query. Other functions take a sequence or MEDLINE id and retrieve the record from the CDROM, or retrieve its "neighbors", entries which are similar to it.

These same functions have been implemented as Internet network access functions to the NCBI data servers, and will become publicly available in 1993. Software which accesses data on the Entrez: Sequences CDROM using the access functions can be changed to access the network servers by just linking to a different library.

The access functions mean that a programmer can incorporate any or all of the functionality shown by the Entrez application into a program of their own design. This means customized analysis and retrieval systems can be written which nonetheless take advantage of the public data retrieval systems.

Vibrant: A Portable Windowing System

Vibrant is a portable windowing system written with CoreLib which allows windowing applications to be written which are source code identical on Macintosh, Microsoft Windows, UNIX X11 Motif and VMS X11 Motif. Vibrant is not meant to provide every possible tool supported by the host system or other commercial products, but rather to vastly simplify writing basic scientific applications which are compatible with the modern windowing environments widely used by scientists now in a portable way.

NCBI fondly hopes that eventually a standard windowing API or appropriate tools will emerge from the computer industry. We will only support Vibrant until that time. While we make it available to the public to use as desired, Vibrant is primarily aimed at serving internal NCBI needs.

A Few Samples

This document contains a large mass of detailed information and new ideas. Just as learning a new language, it is a substantial commitment to learn and understand it all. But knowing it all may not be necessary to get started. This is a quick sample of what is available to give you a flavor of what this is.

These are the ASN.1 definitions used for an article citation (from a book, journal, or proceedings; only the journal case is shown). The "::=" means "is defined as" and SEQUENCE means "the following items come in order", not a biological sequence. You can probably just read the rest.

Cit-art ::= SEQUENCE {                  -- article in journal or book

    title Title OPTIONAL ,              -- title of paper (ANSI requires)

    authors Auth-list OPTIONAL ,        -- authors (ANSI requires)

    from CHOICE {                       -- journal or book

        journal Cit-jour ,

        book Cit-book ,

        proc Cit-proc } }

 

Cit-jour ::= SEQUENCE {             -- Journal citation

    title Title ,                   -- title of journal

    imp Imprint }

 

Auth-list ::= SEQUENCE {

        names CHOICE {

            std SEQUENCE OF Author ,        -- full citations

            ml SEQUENCE OF VisibleString ,  -- MEDLINE, semi-structured

            str SEQUENCE OF VisibleString } , -- free for all

        affil Affil OPTIONAL }        -- author affiliation

 

Title ::= SET OF CHOICE {

    name VisibleString ,    -- Title, Anal,Coll,Mono    AJB

    tsub VisibleString ,    -- Title, Subordinate       A B

    trans VisibleString ,   -- Title, Translated        AJB

    jta VisibleString ,     -- Title, Abbreviated        J

    iso-jta VisibleString , -- specifically ISO jta      J

    ml-jta VisibleString ,  -- specifically MEDLINE jta  J

    coden VisibleString ,   -- a coden                   J

    issn VisibleString ,    -- ISSN                      J

    abr VisibleString ,     -- Title, Abbreviated         B

    isbn VisibleString }    -- ISBN                       B

 

Imprint ::= SEQUENCE {                  -- Imprint group

    date Date ,                         -- date of publication

    volume VisibleString OPTIONAL ,

    issue VisibleString OPTIONAL ,

    pages VisibleString OPTIONAL ,

    section VisibleString OPTIONAL ,

    pub Affil OPTIONAL,                     -- publisher, required for book

    cprt Date OPTIONAL,                     -- copyright date, "    "   "

    part-sup VisibleString OPTIONAL ,       -- used in MEDLINE

    language VisibleString DEFAULT "ENG" ,  -- put here for simplicity

   prepub ENUMERATED {                     -- for prepublication citations

       submitted (1) ,                     -- submitted, not accepted

       in-press (2) ,                    -- accepted, not published

       other (255)  } OPTIONAL }

 

That is a very complete and detailed specification but here is a sample of a journal citation in text form ASN.1. You can easily see how it conforms to the specification and how one would locate the journal title for example.

Cit-art ::= {

  title {

    name "Developmental regulation of a constitutively expressed mouse mRNA

 encoding a 72-kDa heat shock-like protein." } ,

  authors {

    names

      ml {

        "Giebel LB" ,

        "Dworniczak BP" ,

        "Bautz EK" } ,

    affil

      str "Zentrum fur Molekulare Biologie, Universitat Heidelberg (ZMBH),

 Federal Republic of Germany." } ,

  from

    journal {

      title {

        ml-jta "Dev Biol" } ,

      imp {

        date

          std {

            year 1988 ,

            month 1 } ,

        volume "125" ,

        issue "1" ,

        pages "200-7" } } }

Here is the object loader "C" structure and its attendant functions for a Cit-art. There is even a matching function for this object. Details of using the "fromptr" to access the CitJour, CitBook, or CitProc for the article are given in the Bibliographic References chapter. This is just to give the flavor.

/*****************************************************************************

*

*   Cit-art

*

*****************************************************************************/

typedef struct citart {

   ValNodePtr title;       /* choice[1]=name,[2]=tsub,[3]=trans */

   AuthListPtr authors;

   Uint1 from;             /* [1]=journal,[2]=book,[3]=proc */

   Pointer fromptr;

} CitArt, PNTR CitArtPtr;

 

extern CitArtPtr CitArtNew PROTO((void));

extern CitArtPtr CitArtFree PROTO((CitArtPtr cap));

extern CitArtPtr CitArtAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean CitArtAsnWrite PROTO((CitArtPtr cap, AsnIoPtr aip, AsnTypePtr atp));

Int2 CitArtMatch PROTO((CitArtPtr a, CitArtPtr b));

 

Here is a data access function which retrieves a MEDLINE record (a MedlineEntry) from the Entrez: Sequences CDROM, given a MEDLINE unique identifier (uid). A MedlineEntry contains an article citation (i.e. it reuses the Cit-art object from the bibliographic module then adds the additional index terms and information needed to make a MEDLINE record).

MedlineEntryPtr GetMedline (Int4 uid)

{

   MedlineEntryPtr mep = NULL;

 

   if (! EntrezInit())                   /* initialize Entrez CDROM */

       return NULL;                      /* failed to initialize */

   mep = EntrezMedlineEntryGet(uid);     /* get the Medline entry */

   EntrezFini();                         /* close CDROM */

   return mep;

}

Here is a code fragment that will exhaustively explore the MedlineEntry structure in memory and call the user supplied callback function when it finds Imprint in the Cit-jour of the Cit-art (i.e. the article was published in a journal which was printed at a particular time). The string "Cit-art.from.journal.imp" defines a path to the journal imprint following the ASN.1 specification given above.

ExploreExample( MedlineEntryPtr mep)

{

   AsnIoPtr aip;

 

   aip = AsnIoNullOpen();                       /* attach a callback below */

   AsnExpOptNew(aip, "Cit-art.from.journal.imp", NULL, GetImprint);

   MedlineEntryAsnWrite(mep, aip, NULL); /* traverse structure */

   AsnIoClose(aip);

   return;

}

 

/*** this is called whenever a journal imprint in an article is found **/

void GetImprint(AsnExpOptStructPtr aeosp)

{

   ImprintPtr ip;

 

   /*

   ** Make sure we are at the beginning of an Imprint

   */

 

   if (aeosp->dvp->intvalue != START_STRUCT) return;

 

   ip = (ImprintPtr)aeosp->the_struct;  /* we have the Imprint */

       /*.... do whatever you want with it */

   return;

}

Finally, here we print out the MedlineEntry in EndNote format and free the memory it used.

MedlineToFile (MedlineEntryPtr mep)

{

   FILE *fp;

 

   fp = FileOpen("test.out", "w");

   MedlineEntryToDocFile (mep, fp);

   FileClose(fp);

   MedlineEntryFree(mep);

   return;

}

Using This Document

This document has a detailed table of contents which can direct you to the topic of interest. For an initial acquaintance with the system, read the Data Model chapter and the introductions to the other chapters. Then, depending on your style and interests, either:

1) Download the software toolkit, build it, and make the demo programs. Print the ".c" files for the demos and look them over. Print the ".asn" files from the \asn directory and look them over. Print the ".h" files from the \object directory. Print "sequtil.h" and "seqport.h" from \api. Print "accentr.h" from \cdromlib. Go back and read the rest of the documentation.

2) Read the documentation by scanning the sections after the introductions in each chapter. Then return in detail to what interests you.

Contacting NCBI

You can download the software tools (all versions) by anonymous ftp to ftp.ncbi.nlm.nih.gov.

cd toolbox\ncbi_tools

bin

get ncbi.tar.Z                       (compressed UNIX tar file)

or

get ncbiZ.exe                       (self extracting DOS archive)

or

get ncbi.sea.hqx                 (self extracting Mac archive)

You can get on an email list to be notified of new releases of software by sending your name, address, institution, and email address to bits-request@ncbi.nlm.nih.gov

You can email to toolbox@ncbi.nlm.nih.gov

You can FAX 301-480-9241, attn. toolbox

You can mail to:

toolbox

NCBI

Bldg 38A, NIH

8600 Rockville Pike

Bethesda, MD 20894

All comments are welcome. If you are part of a larger project or group who wish to make use of the NCBI tools or to establish data exchange with NCBI, please let us know and we will do whatever we can to ensure your success.



Data Model


Introduction
Biological Sequences
Classes of Biological Sequences
Locations on Biological Sequences
Associating Annotation With Locations On Biological Sequences
Collections of Related Biological Sequences
Consequences of the Data Model


 Introduction

The NCBI sequence databases and software tools are designed around a particular model of biological sequence data.  It is designed to provide a few unifying concepts which cross a wide range of domains, providing a path between the domains. Specialized objects are defined which are appropriate within a domain.  In the following sections we will present the unifying ideas, and then examine each area of the model in more detail.

Since we expect that computer technologies will continue to develop at a rapid rate, NCBI has made considerable investment of time and energy to ensure that our data and software tools are not too tightly bound to any particular computer platform or database technology.  However, we also wish to embrace the intellectual rigor imposed by describing our data within a formal system and in a machine readable and checkable way.  For this reason we have chosen to describe our data in Abstract Syntax Notation 1 (ASN.1; ISO 8824, 8825).  Enough explanation will be given here to allow the reader to examine the data definitions.  A much fuller description of ASN.1 and the NCBI software tools which use it appears in later chapters.

  The data specification chapters are arranged by ASN.1 module with detailed discussions of data objects defined in each and the software functions available to operate on those objects.  Each ASN.1 defined object has a matching "C" language structure.  Each "C" structure has at a minimum, a function to create it, write it to an ASN.1 stream, read it from an ASN.1 stream, and destroy it.  Many objects have additional functions.  Some of these are described in the chapter on the module and some with more extensive interfaces are described in additional chapters.  Each module chapter begins with a description of the elements, followed by the full ASN.1 definition of the module, then the "C" code header defining the structures.

This chapter provides an overview of all modules.  Selected ASN.1 definitions are inserted into the body of the text as necessary.  They are also described in the chapter on the appropriate module.

There are two major areas for which data objects have been defined.  One is bibliographic data.  It is clear that this class of information is central to all scientific fields within and outside of molecular biology so we expect these definitions to be widely useful.  We have followed the American National Standard for Bibliographic References (ANSI Z39.29-1977) and consulted with the US Patent Office and professional librarians to ensure complete and accurate representation of citation information.  Unlike biological data, this data is relatively well understood, so we hope that the bibliographic specification can be quite complete and stable.  Despite its importance, the bibliographic specification will not be discussed further here, since it does not present ideas which may be novel to the reader.

The other major area of the specification is biological sequence data and its associated information.  Here the data model attempts to achieve a number of goals.  Biomedical information is a vast interconnected web of data which crosses many domains of discourse with very different ways of viewing the world.  Biological science is very much like the parable of the blind men and the elephant. To some of the blind men the elephant feels like a column, to some like a snake, to others like a wall.  The excitement of modern biological research is that we all agree that, at least at some level, we are all exploring aspects of the same thing.  But it is early enough in the development of the science that we cannot agree on what that thing is.

The power of molecular biology is that DNA and protein sequence data cut across most fields of biology from evolution to development, from enzymology to agriculture, from statistical mechanics to medicine.  Sequence data can be viewed as a simple, relatively well defined armature on which data from various disciplines can be hung.  By associating diverse data with the sequence, connections can be made between fields of research with no other common ground, and often with little or no idea of what the other field is doing.

This data model establishes a biological sequence as a simple integer coordinate system with which diverse data can be associated.  It is reasonable to hope that such a simple core can be very stable and compatible with a very wide range of data.  Additional information closely linked to the coordinate system, such as the sequence of amino acids or bases, or genes on a genetic map are layered onto it.  With stable identifiers for specific coordinate systems, a greater diversity of information about the coordinate system can be specifically attached to it in a very flexible yet rigorous way.  The essential differences between different biological forms are preserved, yet they can be viewed as aspects of the same thing around the core, and thus move us toward our goal of understanding the totality.

Biological Sequences

A Bioseq is a single continuous biological sequence.  It can be nucleic acid or protein.  It can be fully instantiated (i.e. we have data for every residue) or only partially instantiated (e.g. we know a fragment is 10 kilobases long, but we only have sequence data over 1 kilobase).  A Bioseq is defined in ASN.1 as follows:

 

Bioseq ::= SEQUENCE {

    id SET OF Seq-id ,

    descr Seq-descr OPTIONAL ,

    inst Seq-inst ,

    annot SET OF Seq-annot OPTIONAL }

In ASN.1 a named datatype begins with a capital letter (e.g. Bioseq). The symbol "::=" means "is defined as". A primitive type is all capitals (e.g. SEQUENCE).  A field within a named datatype begins with a lower case letter (e.g. descr).  A structured datatype is bounded by curly brackets ({}). We can now read the definition above:  a Bioseq is defined as a SEQUENCE (i.e. a structure where the elements must come in order; the mathematical notion of SEQUENCE, not the biological one).  The first element of Bioseq is called "id" and is a SET OF (i.e. an unordered collection of repeating elements of the same type) a named datatype called "Seq-id". Seq-id would have its own definition elsewhere.  The second element is called "descr" and is a named type called "Seq-descr", which is OPTIONAL.  In this text, when we wish to refer to the id element of the named type Bioseq, we will use the notation "Bioseq.id".

A Bioseq has two OPTIONAL elements, which both have descriptive information ABOUT the sequence.  Seq-descr is a collection of types of information about the context of the sequence.  It may set biological context (e.g. define the organism sequenced), or bibliographic context (e.g. the paper it was published in), among other things.  Seq-annot is information that is explicitly tied to locations on the sequence.  This could be feature tables, alignments, or graphs, at the present time.  A Bioseq can have more than one feature table, perhaps coming from different sources, or a feature table and a graph, etc.

A Bioseq is only REQUIRED to have two elements, id and inst.  Bioseq.id is one or more identifiers for this Bioseq.  An identifier is a key which allows us to retrieve this object from a database or identify it uniquely.  It is not a name, which is a human compatible description, but not necessarily a unique identifier.  The name "Jane Doe" does not uniquely identify a person in the United States, while the identifier, social security number, does.  Each Seq-id is a CHOICE of one of a number of identifier types from different databases, which may have different structures.  All Bioseqs MUST have at least one identifier.

Classes of Biological Sequences

The other required element of a Bioseq is a Seq-inst.  This element instantiates the sequence itself.  It represents things like is it DNA, RNA, or protein?  Circular or linear? Double-stranded or single-stranded?  How long is it?

Seq-inst ::= SEQUENCE {

                repr        ENUMERATED {

                                                not-set (0) ,

                                                virtual (1) ,

                                                raw (2) ,

                                                seg (3) ,

                                                const (4) ,

                                                ref (5) ,

                                                consen (6) ,

                                                map (7) ,

                                                other (255) } ,

                mol        ENUMERATED {

                                                not-set (0) ,

                                                dna (1) ,

                                                rna (2) ,

                                                aa (3) ,

                                                na (4) ,

                                                other (255) } ,

                length   INTEGER            OPTIONAL ,

                fuzz       Int-fuzz                                OPTIONAL ,

                topology ENUMERATED {

                                                not-set (0) ,

                                                linear (1) ,

                                                circular (2) ,

                                                tandem (3) ,

                                                other (255) } DEFAULT linear ,

                strand   ENUMERATED {

                                                not-set (0) ,

                                                ss (1) ,

                                                ds (2) ,

                                                mixed (3) ,

                                                other (255) } OPTIONAL ,

                seq-data Seq-data            OPTIONAL ,

                ext          Seq-ext OPTIONAL ,

                hist         Seq-hist                OPTIONAL }

Seq-inst is the parent class of a sequence representation class hierarchy.  There are two major branches to the hierarchy.  The molecule type branch is indicated by Seq-inst.mol.  This could be a nucleic acid, or further sub classified as RNA or DNA.  The nucleic acid may be circular, linear, or one repeat of a tandem repeat structure.  It can be double, single, or of a mixed strandedness.  It could also be a protein, in which case topology and strandedness are not relevant.

There is also a representation branch, which is independent of the molecule type branch.  This class hierarchy involves the particular data structure used to represent the knowledge we have about the molecule, no matter which part of the molecule type branch it may be in.  The repr element indicates the type of representation used.  The aim of such a set of representation classes is to support the information to express different views of sequence based objects, from chromosomes to restriction fragments, from genetic maps to proteins, within a single overall model.  The ability to do this confers profound advantages for software tools, data storage and retrieval, and traversal of related sequence and map data from different scientific domains.

A virtual representation is used to describe a sequence about which we may know things like it is DNA, it is double stranded, we may even know its length, but we do not have the actual sequence itself yet.  Most fields of the Seq-inst are filled in, but Seq-inst.seq-data is empty.  An example would be a band on a restriction map.

A raw representation is used for what we traditionally consider a sequence.  We know it is DNA, it is double stranded, we know its length exactly, and we have the sequence data itself.  In this case, Seq-inst.seq-data contains the sequence data.

A segmented representation is very analogous to a virtual representation.  We posit that a continuous double stranded DNA sequence of a certain length exists, and pieces of it exist in other Bioseqs, but there is no data in Seq-inst.seq-data.  Such a case would be when we have cloned and mapped a DNA fragment containing a large protein coding region, but have only actually sequenced the regions immediately around the exons.  The sequence of each exon is an individual raw Bioseq in its own right.  The regions between exons are virtual Bioseqs.  The segmented Bioseq uses Seq-inst.ext to hold a SEQUENCE OF Seq-loc.  That is, the extension is an ordered series of locations on OTHER Bioseqs, in this case the raw and virtual Bioseqs representing the exons and introns.  The segmented Bioseq contains data only by reference to other Bioseqs.  In order to retrieve the base at the first position in the segmented Bioseq, one would go to the first Seq-loc in the extension, and return the appropriate base from the Bioseq it points to.

A constructed Bioseq is used to describe an assembly or merge of other Bioseqs.  It is analogous to the raw representation.  In fact, most raw Bioseqs were actually constructed from an assembly of gel readings.  However, the constructed representation class is really meant for tracking higher level merging, such as when an expert in a particular organism or gene region may construct a "typical" sequence from that region by merging available sequence data, often published by different groups, using domain knowledge to resolve discrepancies between reports or to select a typical allele.  Seq-inst contains an optional Seq-hist object.  Seq-hist contains a field called "assembly" which is a SET OF Seq-align, or sequence alignments.  The alignments are used to record the history of how the various component Bioseqs used for the merge are related to the final product.  A constructed sequence DOES contain sequence data in Seq-inst.seq-data, unlike a segmented sequence, because the component sequences may overlap, or expert knowledge may have been used to determine the "correct" residue at any position that is not captured in the original components.  So Seq-hist.assembly is used to simply record the relationship of the merge to the old Bioseqs, but does NOT describe how to generate it from them.

A map is akin to a virtual Bioseq.  For example, for a genetic map of E.coli, we might posit that the E.coli chromosome is about 5 million base pairs long, DNA, double stranded, circular, but we do not have the sequence data for it.  However, we do know the positions of some genes on this putative sequence.  In this case, the Seq-inst.ext is a SEQUENCE OF Seq-feat, that is, a feature table.  For a genetic map, the feature table contains Gene-ref features.  An ordered restriction map would have a feature table containing Rsite-ref features.  The feature table is part of Seq-inst because, for a map, it is an essential part of instantiating the map Bioseq, not merely annotation on a known sequence.  In a sense, for a map, the annotation IS part of the sequence.  As an aside, note that we have given gene positions on the E.coli genetic map in base pairs, while the standard E.coli map is numbered from 0.0 to 100.0 map units.  Numbering systems can be applied to a Bioseq as a descriptor or a feature.  For E.coli, we would simply apply the 0.0 - 100.0 floating point numbering system to the map Bioseq.  Gene positions can then be shown to the scientists in familiar map units, while the underlying software still treats positions as large integers, just the same as with any other Bioseq.

Coordinates on ANY class of Bioseq are ALWAYS integer offsets.  So the first residue in any Bioseq is at position 0.  The last residue of any Bioseq is in position (length - 1).

The consequence of this design is that one uses EXACTLY the same data object to describe the location of a gene on an unsequenced restriction fragment, a fully sequenced piece of DNA, a partially sequenced piece of DNA, a putative overview of a large genetic region, or a genetic or physical map.  Software to display, manipulate, or compare gene locations can work without change on the full range of possible representations.  Sequence and physical map data can be easily integrated into a single, dynamically assembled view by creating a segmented sequence which points alternatively to raw or constructed Bioseqs and parts of a map Bioseq.  The relationship between a genetic and physical map is simply an alignment between two Bioseqs of representation class map, no different than the alignment between two sequences of class raw generated by a database search program like BLAST or FASTA.

Locations on Biological Sequences

A Seq-loc is an object which defines a location on a Bioseq.  The smooth class hierarchy for Seq-inst makes it possible to use the same Seq-loc to describe an interval on a genetic map as that used to describe an interval on a sequenced molecule.

Seq-loc is itself a class hierarchy.  A valid Seq-loc can be an interval, a point, a whole sequence, a series of intervals, and so on.

-- Seq-loc: a location on a Bioseq.  Exactly one alternative is present.
-- Every alternative other than null and feat contains a Seq-id, so the
-- location is independent of the context in which it appears.
Seq-loc ::= CHOICE {

                null                        NULL ,      -- region of unknown length for which no data exists

                empty                   Seq-id ,

                whole                    Seq-id ,      -- a whole sequence

                int                           Seq-interval ,      -- an interval

                packed-int           Packed-seqint ,      -- presumably a series of intervals (confirm against Packed-seqint)

                pnt                         Seq-point ,      -- a point

                packed-pnt         Packed-seqpnt ,

                mix                        Seq-loc-mix ,

                equiv                     Seq-loc-equiv ,

                bond                      Seq-bond ,

                feat                        Feat-id }      -- the only non-null alternative that does not contain a Seq-id

Seq-loc.null indicates a region of unknown length for which no data exists.  Such a location may be used in a segmented sequence for the region between two sequenced fragments about which nothing, not even length, is known.

All other Seq-loc types, except Seq-loc.feat, contain a Seq-id.  This means they are independent of context.  This means that data objects describing information ABOUT Bioseqs can be created and exchanged independently from the Bioseq itself.  This encourages the development and exchange of structured knowledge about sequence data from many directions and is an essential goal of the data model.

Associating Annotation With Locations On Biological Sequences

Seq-annot, or sequence annotation, is a collection of information ABOUT a sequence, tied to specific regions of Bioseqs through the use of Seq-loc's.  A Bioseq can have many Seq-annot's associated with it.  This allows knowledge from a variety of sources to be collected in a single place but still be attributed to the original sources.  Currently there are three kinds of Seq-annot, feature tables, alignments, and graphs.

Feature Tables

A feature table is a collection of Seq-feat, or sequence features.  A Seq-feat is designed to tie a Seq-loc together with a datablock, a block of specific data.  Datablocks are defined objects themselves, many of which are objects used in their own right in some other context, such as publications (Pub) or references to organisms (Org-ref) or genes (Gene-ref).  Some datablocks, such as coding regions (CdRegion) make sense only in the context of a Seq-loc.  However, since by design there is no intention that one datablock need to have anything in common with any other datablock, each can be tailored exactly to do a particular job.  If a change or addition is required to one datablock, no others are affected.  In those cases where a pre-existing object from another context is used as a datablock, any software that can use that object can now operate on the feature as well.  For example, a piece of code to display a publication can operate on a publication from a bibliographic database or one used as a sequence feature with no change.

Since the Seq-feat data structure itself and the Seq-loc used to attach it to the sequence are common to all features, it is also possible to support a class of operations over all features without regard to the different types of datablocks attached to them.  So a function to determine all features in a particular region of a Bioseq need not care what type of features they are.

A Seq-feat is bipolar in that it contains up to two Seq-loc's.  Seq-feat.location indicates the "source" and is the location similar to the single location in common feature table implementations.  Seq-feat.product is the "sink".  A CdRegion feature would have its Seq-feat.location on the DNA and its Seq-feat.product on the protein sequence produced.  Used this way it defines the process of translating a DNA sequence to a protein sequence.  This establishes in an explicit way the important relationship between nucleic acid and protein sequence databases.

The presence of two Seq-loc's also allows a more complete representation of data conflicts or exceptional biological circumstances.  If an author presents a DNA sequence and its protein product in a figure in a paper, it is possible to enter the DNA and protein sequences independently, and then confirm through the CdRegion feature that the DNA in fact translates to that protein sequence.  In an unfortunate number of published papers, the DNA presented does not translate to the protein presented.  This may be a signal that the database has made an error of some sort, which can be caught early and corrected.  Or the original paper may be in error.  In this case, the "conflict" flag can be set in CdRegion, but the protein sequence is not lost, and retroactive work can be done to determine the source of the problem.  It may also be the case that a genomic sequence cannot be translated to a protein for a known biological reason, such as RNA editing or suppressor tRNAs.  In this case the "exception" flag can be set in Seq-feat to indicate that the data are correct, but will not behave in the expected way.

Sequence Alignments

A sequence alignment is essentially a correlation between Seq-locs, often associated with some score.  An alignment is most commonly between two sequences, but it may be among many at once.  In an alignment between two raw Bioseqs, a certain amount of optimization can be done in the data structure based on the knowledge that there is a one to one mapping between the residues of the sequences.  So instead of recording the start and stop in Bioseq A and the start and stop in Bioseq B, it is enough to record the start in A and the start in B and the length of the aligned region.  However if one is aligning a genetic map Bioseq with a physical map Bioseq, then one will wish to allow the aligned regions to distort relative to one another to account for the differences from the different mapping techniques.  To accommodate this most general case, there is a Seq-align type which is purely correlations between Seq-locs of any type, with no constraint that they cover exactly the same number of residues.

A Seq-align is considered to be a SEQUENCE OF segments.  Each segment is an unbroken interval on a defined Bioseq, or a gap in that Bioseq.  For example, let us look at the following three dimensional alignment with 6 segments:

 

   Seq-ids

   id=100    AAGGCCTTTTAGAGATGATGATGATGATGA

   id=200    AAGGCCTaTTAG.......GATGATGATGA

   id=300    ....CCTTTTAGAGATGATGAT....ATGA

              | 1 |   2   |   3  | 4| 5 | 6 |  Segments

 

The example above is a global alignment; that is, each segment sequentially maps a region of each Bioseq to a region of the others.  An alignment can also be of type "diags", which is just a collection of segments with no implication about the logic of joining one segment to the next.  This is equivalent to the diagonal lines that are shown on a dot-matrix plot.

The example above illustrates the most general form of a Seq-align, Std-seg, where each segment is purely a correlated set of Seq-loc.  Two other forms of Seq-align allow denser packing of data for when only raw Bioseqs are aligned.  These are Dense-seg, for global alignments, and Dense-diag for "diag" collections.  The basic underlying model for these denser types is very similar to that shown above, but the data structure itself is somewhat different.

Sequence Graph

The third annotation type is a graph on a sequence, Seq-graph.  It is basically a Seq-loc, over which to apply the graph, and a series of numbers representing values of the graph along the sequence.  A software tool which calculates base composition or hydrophobic tendency might generate a Seq-graph.  Additional fields in Seq-graph allow specification of axis labels, setting of ranges covered, compression of the data relative to the sequence, and so on.

Collections of Related Biological Sequences

It is often useful, even "natural", to package a group of sequences together.  Some examples are a segmented Bioseq and the Bioseqs that make up its parts, a DNA sequence and its translated proteins, the separate chains of a multi-chain molecule, and so on.  A Bioseq-set is such a collection of Bioseqs.

-- Bioseq-set: a collection of related Bioseqs packaged together.
-- The leading fields identify and describe the set itself; from descr
-- onward the layout parallels that of a Bioseq.
Bioseq-set ::= SEQUENCE {

                id            Object-id             OPTIONAL ,

                coll         Dbtag                    OPTIONAL ,

                level      INTEGER            OPTIONAL ,

                -- kind of collection; defaults to not-set when absent
                class      ENUMERATED {

                                                not-set (0) ,

                                                nuc-prot (1) ,

                                                segset (2) ,

                                                conset (3) ,

                                                parts (4) ,

                                                gibb (5) ,

                                                gi (6) ,

                                                genbank (7) ,

                                                 pir (8) ,

                                                pub-set (9) ,

                                                equiv (10) ,

                                                swissprot (11) ,

                                                pdb-entry (12) ,

                                                other (255) } DEFAULT not-set ,

                release  VisibleString      OPTIONAL ,

                date       Date                      OPTIONAL ,

                -- descriptors apply to everything contained below this level
                descr     Seq-descr            OPTIONAL ,

                -- the contained Bioseqs and/or nested Bioseq-sets
                seq-set  SEQUENCE OF Seq-entry ,

                -- annotation, generally that which applies to more than one member Bioseq
                annot     SET OF Seq-annot OPTIONAL }

The basic structure of a Bioseq-set is very similar to that of a Bioseq.  Instead of Bioseq.id, there is a series of identifier and descriptive fields for the set.  A Bioseq-set is only a convenient way of packaging sequences so controlled, stable identifiers are less important for them than they are for Bioseqs.  After the first few fields the structure is exactly parallel to a Bioseq.

There are descriptors which describe aspects of the collection and the Bioseqs within the collection.  The general rule for descriptors in a Bioseq-set is that they apply to "all of everything below".  That is, a Bioseq-set of human sequences need have only one Org-ref descriptor for "human" at the top level of the set, and it is applied to all Bioseqs within the set.

Then follows the equivalent of Seq-inst, that is the instantiation of the data.  In this case, the data is the chain of contained Bioseqs or Bioseq-sets.  A Seq-entry is either a Bioseq or Bioseq-set.  Seq-entry's are very often used as arguments to display and analysis functions, since one can move around either a single Bioseq or a collection of related Bioseqs in context just as easily.  This also makes a Bioseq-set recursive.  That is, it may consist of collections of collections.

-- Seq-entry: either a single Bioseq or a Bioseq-set.  Because a
-- Bioseq-set holds Seq-entry values, sets may nest recursively.
Seq-entry ::= CHOICE {

                seq         Bioseq ,      -- a single Bioseq

                set          Bioseq-set }      -- a collection of Bioseqs (possibly containing further sets)

Finally, a Bioseq-set may contain Seq-annot's.  Generally one would put the Seq-annot's which apply to more than one Bioseq in the Bioseq-set at this level.  Examples would be CdRegion features that point to DNA and protein Bioseqs, or Seq-aligns which align more than one Bioseq with each other.  However, since Seq-annot's always explicitly cite a Seq-id, it does not matter, in terms of meaning, at what level they are put.  This is in contrast to descriptors, where context does matter.

Consequences of the Data Model

This data model has profound consequences for building sequence databases and for researchers and software tools interacting with them.  Assuming that Seq-ids point to stable coordinate systems, it is easily possible to consider the whole set of data conforming to the model as a distributed, active heterogeneous database.  For example, let us suppose that two raw Bioseqs with Seq-ids "A" and "B" are published in the scientific literature and appear in the large public sequence databases.  They are both genomic nucleic acid sequences from human, each coding for a single protein.

One researcher is a specialist in transcription initiation.  He finds additional experimental information involving detailed work on initiation for the flanking region of Bioseq "A".  He can then submit a feature table with a TxInit feature in it to the database with his summarized data.  He need not contact the original author of "A", nor edit the original sequence entry for "A" to do this.  The database staff, who are not experts in transcription initiation, need not attempt to annotate every transcription initiation paper in sufficient detail and accuracy to be of interest to a specialist in the area.  The researcher submitting the feature need not use any particular software system or computer to participate; he need only submit an ASN.1 message which conforms to the specification for a feature.

Another researcher is a medical geneticist who is interested in the medical consequences of mutations in the gene on Bioseq "B".  This individual can add annotation to "B" which is totally different in content to that added by the transcription specialist (in fact, it is unlikely that either follows the literature read by the other) and submit the data to the database in precisely the same way.

A third group may be doing bulk sequencing in the region of the human chromosome where "A" and "B" lie.  They produce a third sequence, "C", which they discover by sequence similarity and mapping data, overlaps "A" at one end and "B" at the other.  This group can submit not just the sequence of "C" but its relationship to "A" and "B" to the database and as part of their publication.

The database now has the information from five different research groups, experts in different fields, using different computer and software systems, and unaware, in many cases, of each other's work, to unambiguously pull together all this related information into an integrated high level view through the use of the shared data model and the controlled Seq-ids on common cited coordinate systems.  This integration across disciplines and generation of high level views of the data is continuously and automatically available to all users and can be updated immediately on the arrival of new data without human intervention or interpretation by the database staff.  This moves scientific databases from the role of curators of scientific data to the role of facilitators of discourse among researchers.  It makes identification of potentially fruitful connections across disciplines an automatic result of data entry, rather than of painstaking analysis by a central group.  It takes advantage of the growing rush of molecular biology data, making its volume and diversity advantages rather than liabilities.



CoreLib: Portable Core Library


Introduction
Application Frameworks
User Interface Elements
Configuration Files
Error Processing
Files and Directories
Memory Management
Byte Stores
String Functions
ValNode Functions
Math Functions
Miscellaneous Utilities
Portability Issues


 Introduction

NCBI has defined a series of header files, basic utility routines, and programming guidelines for the C programming language intended to encourage good programming practice in general and to facilitate the creation of code which will compile and run without change on a variety of hardware platforms under a variety of operating systems and user interfaces, both command line and windowing.  We have developed and tested the system on Intel 80386 and 80486 machines under MS-DOS and Microsoft Windows 3.1, on various Macintosh II machines under Mac-OS, on many different machines under UNIX, on an IBM 3090 running AIX and on VMS VAX. A complete list of systems is given in the README file for the NCBI software toolkit release.

A large number of applications have been written using this set of core tools, and they compile and run without change on all the above systems.  While there is clearly no perfect or all inclusive system, we find this one works remarkably well.  This system is not meant to be a universal panacea.  It is meant only to allow the creation of portable code for most of the types of things scientists might want to do on a computer.  It is not expected to support extremely interactive or graphically oriented programs, nor is it meant to support extremely computation resource limited applications.  It is to make an average application portable and robust.

Application Frameworks

In C, there is a common programming model that we are all accustomed to, in which command-line arguments are made available to the program's main function and the stdin and stdout streams are used for input and output data.  However, with some of the modern graphical user interfaces, this may not be convenient or desired.  Furthermore, graphical interfaces generally require substantial initialization before any application code runs and may require specific steps be taken before the application exits.  The exact steps required vary widely from one platform to the next.

To simplify the process of writing programs that run in all of these situations, let us introduce the notion of an application framework, which takes care of whatever initialization and termination steps may be required and provides a uniform mechanism for obtaining program arguments.  The NCBI Toolkit provides two application frameworks.  The first is part of CoreLib and is extremely simple, but useful for "quick-and-dirty" tool development.  The second is provided by Vibrant and supports the full look-and-feel of the target graphical interface (described elsewhere in this manual).

It is important to note that the use of these frameworks is purely optional.  Any NCBI Toolbox function, with the single exception of GetArgs, may be called from any application, whether it is a simple UNIX filter program or a full-blown Macintosh application.

Main Entry Point

To use this simple framework, you should write a function called Main, which you can think of as the entry point to your program.  In fact, the true entry point (main or WinMain or whatever) is a function within CoreLib, which will perform whatever initialization is required for the platform and then call Main.  From Main you will call functions to perform the task for which the program is designed and then Main should return zero on success or non-zero on failure, as in the following example.

#include <ncbi.h>

 

/* Program entry point, called by CoreLib once platform initialization
 * is complete.  Returns 0 on success and 1 on failure. */
Int2 Main()
{
    /* The search result maps directly onto the exit status. */
    return DoSearch("swissprot","query.aa",12,0.1,"blast.out") ? 0 : 1;
}

You should include the C header file ncbi.h to get the function definition (prototype) for Main, as well as all other functions, types, and constants described in this chapter.  It in turn includes various other headers, such as one that contains platform-specific definitions (ncbilcl.h) and others that define the interfaces to the various modules (for example, the memory management functions are defined in ncbimem.h).  Since the order of inclusion may be important in certain instances, it is safest to simply include ncbi.h.

Getting Program Arguments

Notice that Main takes no parameters.  How, then, will your program get access to arguments that may be supplied by the user?  Early in the program, you should make a single call to a function called GetArgs to obtain the program's run-time arguments.  Actually, GetArgs does more than just get arguments; it may also prompt the user for input, validate the arguments against allowed ranges, and convert them to the appropriate integer or floating point types.  The Arg structure contains all of the information required to do this and contains storage for the values returned by the function.

Boolean GetArgs (CharPtr progname, Int2 argcount, Arg *arglist)

Gets arguments for the program named progname.  The  argument specifications are stored in arglist, an array of Arg structures containing argcount elements.  The Arg structure has the following definition:

/* Specification of one program argument for GetArgs, together with the
 * storage that receives the parsed value.  Which of the three value
 * fields is filled in is selected by the type field. */
typedef struct {

    char *prompt;         /* Prompt string shown to the user when a required argument is missing */

    char *defaultvalue;   /* Value assigned when an optional argument is not supplied (may be NULL) */

    char *from;           /* Low end of the allowed range for numeric arguments (NULL for no validation) */

    char *to;             /* High end of the allowed range for numeric arguments (NULL for no validation) */

    Boolean    optional;  /* TRUE if the argument is optional; otherwise it is required */

    char  tag;            /* Single-letter command-line switch that follows the dash */

    Int1  type;           /* Data type (one of the ARG_ symbols); selects the storage field below */

    FloatHi    floatvalue;     /* Returned value for ARG_FLOAT */

    Int4  intvalue;       /* Returned value for ARG_INT and ARG_BOOLEAN */

    CharPtr    strvalue;  /* Returned value for ARG_STRING and the file-name types */

} Arg, *ArgPtr;

The arguments on the command line are expected to consist of the dash (-) character, a single letter tag, and finally the argument value.  There may be a space between the tag and the value.  For example, "-F fname" and "-Ffname" are equivalent.  The field called type determines how the argument is interpreted as well as where it is stored in the Arg structure.

Datatype Symbol

Data Type Description

Storage Field in Arg

ARG_BOOLEAN

TRUE/FALSE value

intvalue

ARG_INT

Integer value

intvalue

ARG_FLOAT

Floating point value

floatvalue

ARG_STRING

String value

strvalue

ARG_FILE_IN

Name of input file

strvalue

ARG_FILE_OUT

Name of output file

strvalue

ARG_DATA_IN

Datalink in

strvalue  [[ VERIFY ]]

ARG_DATA_OUT

Datalink out

strvalue  [[ VERIFY ]]

Arguments are considered to be required unless the optional field is TRUE.  The user will be prompted for all non-optional arguments not given on the command line using the string supplied in prompt.  Optional arguments not supplied by the user are assigned to defaultvalue (may be NULL).  All numerical arguments are converted from strings to either integer or floating point and validated against the valid range defined by the from and to fields (may be NULL for no validation).  If all non-optional arguments have been supplied and validated, TRUE is returned.  If not, or if the only argument given is "-", the program usage is shown and the function returns FALSE.

The example above may be extended as follows.

#include <ncbi.h>

 

/* Argument specifications for the demo program, in the order used by Main:
 *   [0] database name, [1] query file, [2] threshold, [3] expect value,
 *   [4] output file.
 * Field order: prompt, default, from, to, optional, tag, type, and the
 * three (initially empty) value slots. */
Arg arg[] = {
  {"Database","nr",NULL,NULL,FALSE,'D',ARG_STRING,0.0,0, NULL},
  {"Query file","query.aa",NULL,NULL,FALSE,'Q',ARG_FILE_IN,0.0,0, NULL},
  /* Threshold is a range-checked integer (5..25) read back through
   * intvalue, so it must be ARG_INT rather than ARG_BOOLEAN. */
  {"Threshold","13","5","25",TRUE,'T',ARG_INT,0.0,0,NULL},
  {"Expect","0.1","0.01","10",TRUE,'E',ARG_FLOAT,0.0,0,NULL },
  {"Output file","blast.out",NULL,NULL,FALSE,'O',ARG_FILE_OUT,0.0,0, NULL}};

 

/* Program entry point: obtain the run-time arguments, then run the
 * search with them.  Returns 0 on success and 1 on failure. */
Int2 Main()
{
    /* DoSearch is attempted only when GetArgs succeeds; a failure of
     * either step yields the failure status. */
    if (GetArgs("demo",DIM(arg),arg) &&
        DoSearch(arg[0].strvalue,arg[1].strvalue,
               arg[2].intvalue,arg[3].floatvalue,arg[4].strvalue))
          return 0;   /* success */

    return 1;  /* failure */
}

The Vibrant version of GetArgs produces a dialog box containing the prompt strings and edit fields into which the user may enter the values.  If all required arguments are supplied, TRUE is returned.

User Interface Elements

We find it useful to include a minimal set of functions in the core library to provide feedback to the user for such purposes as displaying messages (alerts), providing audible feedback (beeps), and indicating the progress of lengthy operations (monitors).  However, we recognize that a significant amount of customization is needed to suit the tastes and requirements of individual applications programmers using this Toolkit.  Indeed, every single user interface element described below may be replaced by one of your own design.  This is done by registering hook functions with the library that will be called to generate the desired effects.  Without this, your program will get the default functionality provided by CoreLib, which is extremely simple and uses primarily console I/O.  Programs featuring a graphical interface will almost certainly want to install hook functions to provide something more elegant.  For example, the Vibrant application framework installs hooks for all user interface elements prior to calling your Main function.

Alerts

Alerts are used to show a message to the user, which in some cases may be in the form of a question with a small number of possible answers. 

MsgAnswer MsgAlert (MsgKey key, ErrSev sev, const char *capt,
const char *fmt, ...)

Generates a message string using the format string fmt and a variable number of arguments.  The key parameter is used to specify the list of possible user responses and may be any of the following constants.

Symbol                                    Description

KEY_NONE                        No response required (console) or OK button (graphical)

KEY_OK                               OK button

KEY_OKC                            OK and Cancel buttons

KEY_YN                              Yes and No buttons

KEY_YNC                           Yes, No and Cancel buttons

KEY_RC                               Retry and Cancel buttons

KEY_ARI                             Abort, Retry and Ignore buttons

Two additional parameters, a caption string capt and a severity code sev, may be supplied if desired.  Although they  are ignored in the default MsgAlert processing provided by CoreLib, these two arguments are passed through to the message hook function (if any) for use in graphical alerts.  The caption string is intended for use in the caption bar of the alert window (if it has one) and is normally the name of the application.  The severity code is for use in selecting an icon to appear in the content area of the window beside the message text.  Any of the severity constants listed for the ErrPostEx function (described later in this chapter) may be used.

MsgAnswer MsgAlertStr (MsgKey key, ErrSev sev,
const char *caption, const char *str)

Same as MsgAlert except that the message str is a single string instead of a format string and argument list.

MsgAnswer Message (Int2 option, const char *fmt, ...)

Displays a message to the user that is generated from the format specification string fmt and a variable list of arguments.  The option argument modifies the behavior of the function and may be any one of the following.

Symbol                                    Description

MSG_ERROR                     Beep, show the message, and wait for an acknowledgement from the user before continuing.

MSG_FATAL                      Beep, show the message, then halt the program by calling the AbnormalExit function.

MSG_OK                              Show the message and wait for an acknowledgement from the user before continuing (press the OK button or wait for a keypress).

MSG_OKC                           Show the message and prompt for OK/Cancel.

MSG_YN                             Show the message and prompt for Yes/No.

MSG_YNC                          Show the message and prompt for Yes/No/Cancel.

MSG_RC                              Show the message and prompt for Retry/Cancel.

MSG_ARI                            Show the message and prompt for Abort/Retry/Ignore.

MSG_POST                         Show the message and continue (in graphical interfaces, the alert must generally be dismissed by explicit action of the user).

MSG_POSTERR                Beep, show the message and continue.

Message calls MsgAlert to actually display the message.  If an application property has been installed (see SetAppProperty) with the key "AppName", it is used as the caption when calling MsgAlert.  Otherwise, there is no caption.  The function result is an enumerated type and may be any of the following values.

/* Answer codes returned by Message, MsgAlert, MsgAlertStr and by a
 * message hook function to report which response the user chose. */
typedef enum MsgAnswer
{
    ANS_NONE,      /* no user response was required */
    ANS_OK,
    ANS_CANCEL,
    ANS_ABORT,
    ANS_RETRY,
    ANS_IGNORE,
    ANS_YES,
    ANS_NO
} MsgAnswer;

ANS_NONE is returned for the options that do not require any user response (MSG_POST and MSG_POSTERR).


MsgHook SetMessageHook (MsgHook hook)

Installs hook as the function to be called for showing messages.  A pointer to the previous hook function (if any) is returned, so that it is possible to later restore it.  The message hook function should have the following form.

/* Skeleton of a message hook suitable for SetMessageHook.
 *   key     - selects the set of response buttons to offer
 *   sev     - severity code, used to select an icon
 *   caption - title for the alert window
 *   message - the fully formatted message text
 * Returns the answer code corresponding to the user's response. */
MsgAnswer LIBCALLBACK MyMessageHook (MsgKey key, ErrSev sev,
               const char *caption, const char *message)
{
    /* Start from a defined value so that the skeleton never returns an
     * indeterminate answer before the dialog code is filled in. */
    MsgAnswer answer = ANS_NONE;

    /* Create a dialog box using caption as the title.  Within
    *  content region, show message and an icon selected using
    *  sev.  Place buttons on the dialog based on the value of
    *  key.  Wait for the user to press a button, then destroy
    *  the dialog window and return the appropriate answer code. */

    return answer;
}

Beeps

void Beep ()

Sounds an audible beep.

BeepHook SetBeepHook (BeepHook hook)

Installs hook as the function to be called for sounding beeps.  The function takes no arguments and has no return value.  The return value is a pointer to the previous BeepHook.

void LIBCALLBACK MyBeepHook ()

{

    PlaySoundFile("beep.snd");
}

 

Int2 Main ()

{

    SetBeepHook(MyBeepFunction);

 

         ...etc...

 

    return 0;

}

Monitors

Monitors are user interface elements used to indicate the status or progress of potentially lengthy operations.  There are two general types of monitors.  The first is the string monitor, which displays a series of strings, one after the other.  The second is the integer range monitor, which indicates progress of an operation as an integer value within some defined range.  String monitors and integer monitors must be created before they can be used, using either MonitorStrNew or MonitorIntNew, respectively, and destroyed when they are no longer needed using MonitorFree.

In addition to these, there is the notion of a default progress monitor which may be used by calling ProgMon, even though it may have been initialized in a completely different code module or not initialized at all.  Normally, the default monitor is created in the top-level application code and registered with the system by calling SetProgMon.

The monitor functionality provided by CoreLib is appropriate for use in situations where console I/O is used.  For applications having a graphical interface or for console-style programs in which customized monitor behavior is desired, you can write your own monitor hook function to implement the user interface and install it by calling SetMonitorHook.  Programs using Vibrant need not do this as the application framework takes care of installing hook functions for all user interface elements.

MonitorPtr MonitorStrNew (const char *title, Int2 len)

Creates a new string monitor with the caption title and returns a pointer to it.  The maximum length of any string value is supplied as the len argument.  NULL is returned on failure.

Boolean MonitorStrValue (MonitorPtr mon, const char *sval)

Sets the value of the string monitor mon to the string sval.  The return value indicates success or failure.

MonitorPtr MonitorIntNew (const char *title, Int4 n1, Int4 n2)

Creates a new integer monitor with the caption title whose extent is from n1 to n2 and returns a pointer to it.  NULL is returned on failure.

Boolean MonitorIntValue (MonitorPtr mon, Int4 ival)

Sets the value of the integer monitor mon to ival.  The return value indicates success or failure.

MonitorPtr MonitorFree (MonitorPtr mon)

Frees the monitor mon, which may be either of the integer or string monitor class.  The return value is always NULL.


MonitorHook SetMonitorHook (MonitorHook hook)

Installs hook as the function to be called to carry out monitor activities.  The value of the previous hook function is returned.  The hook function should have the following form.

int LIBCALLBACK MyMonitorHook (Monitor *mon, MonCode code)

{

    /* value handed back to the caller: TRUE unless a request fails or is not recognized */

    int handled = TRUE;

    switch (code)

    {

          case MonCode_Create :

               /* allocate memory & create interface elements here */

               if (failure)

                     handled = FALSE;

               break;

          case MonCode_Destroy :

               /* free memory & destroy interface elements here */

               break;

          case MonCode_IntValue :

               /* redraw the interface with the new integer value here (presumably requested via MonitorIntValue) */

               break;

          case MonCode_StrValue :

               /* redraw the interface with the new string value here (presumably requested via MonitorStrValue) */

               break;

          default :

               /* unrecognized request code */

               handled = FALSE;

               break;

    }

    return handled;
}

Boolean SetProgMon (ProgMonFunc hook, VoidPtr data)

Installs hook as the function to be called for default progress monitor handling.  A pointer to an arbitrary data block data and a string are passed to the hook function when it is called.  In the example below, a normal string monitor is used for default processing.

Boolean LIBCALLBACK MyProgMonHook (VoidPtr data, CharStr str);

Monitor *defProgMon;

 

Int Main ()

{

    defProgMon = MonitorStrNew(“Progress Messages”,80);

    SetProgMon(MyProgMonHook,(void*)defProgMon);

 

         ... do stuff ...

 

    MonitorFree(defProgMon);

    return 0;
}

 

Boolean LIBCALLBACK MyProgMonHook (void *data, const char *str)

{

    return MonitorStrValue((Monitor*)data,str);

}

Boolean ProgMon (CharPtr str)

Pass the string str to the default progress monitor.  If no default monitor has been installed with SetProgMon, calling this function has no effect.  The return value is whatever was returned by the default monitor hook function.

 

Configuration Files

A scheme for storing and modifying persistent system and application configuration options is provided.  It is modeled on services provided in the Microsoft Windows environment and has been extended to work on all of the platforms that we support.

File Names

Since each platform may have its own convention for naming configuration files, we have opted to use a common basename from which the actual filename can be derived as appropriate for the system.  This is described in the table below, where xxx represents the basename.

Platform

File Name

Locations searched

UNIX

.xxxrc

1. Path from NCBI environment variable
2. User's home directory
3. Current working directory

VMS

.xxxrc

1. Path from NCBI environment variable
2. User's home directory
3. Current working directory

Macintosh

xxx.cnf

1. System Folder:Preferences
2. System Folder

MS-DOS

xxx.cfg

1. Path from NCBI environment variable
2. Current directory

MS-Windows

xxx.ini

1. Windows directory

File Format

Configuration files are plain ASCII text files that may be edited by the user.  They are divided into sections, each of which is headed by the section name enclosed in square brackets.  Below each section heading is a series of key=value strings, somewhat analogous to the environment variables that are used on many platforms.  Any line that begins with a semi-colon is considered a comment.  The following lines serve as an example of what may appear in a settings file:

[General]

AsnLoad = c:\ncbi\asnload

AsnData = c:\ncbi\asndata

 

[CD-ROM]

path = E:\

 

[NetService]

; Note: set USERNAME = ? to be prompted for your username

username=?

host=dispatcher@ncbi.nlm.nih.gov

timeout=30

 

Configuration File Functions

Boolean SetAppParam (const char *filebase, const char *sect, const char *key, const char *val)

Sets the value of key to val in section sect of the configuration file specified by filebase.  The return value indicates success or failure.

Boolean TransientSetAppParam (const char *filebase,
const char *sect, const char *key, const char *val)

Sets a configuration value like SetAppParam, except that the setting exists only in memory and is not written to the configuration file.

int GetAppParam (const char *filebase, const char *sect,
const char *key, const char *dflt, char *buf, int buflen)

Searches section sect of the configuration file specified by filebase for key and returns its value in the buffer buf.  If key is not found, the default value dflt is copied to buf.  The return value is the number of characters copied to buf, which may be up to buflen-1.

Boolean FindPath (const char *filebase, const char *sect,
const char *key, char *buf, int buflen)

Gets a configuration setting by passing the supplied arguments to GetAppParam (with NULL as the default) and then ensures that the returned string is of the proper form for a filesystem path on the particular platform.

Error Processing

The core library includes functions for posting, reporting, logging, and handling whatever error conditions may be encountered during program execution.  An important concept is that indicating that an error occurred, or posting an error, can be functionally decoupled from the handling of that error.  The function ErrPostEx is provided for posting an error along with an indication of its severity.  If no special provisions have been made, default processing of the error will occur, which may include (depending on the severity) displaying the error to the user and halting the program.  However, there are a number of ways to customize this behavior.  The simplest is to adjust the severity level that will be displayed to the users or that will result in a fatal program exit using ErrSetMessageLevel and ErrSetFatalLevel, respectively.  For maximal control, you can use ErrSetHandler to install your own function that will be called whenever an error is posted.

The software toolkit provides the ability to keep a log of all posted errors, which we have found quite useful as an aid to debugging or for producing reports on large data processing runs.  Error logging is performed at the time an error is posted, regardless of how or when the error is ever handled.  Logging is disabled by default; to enable it, use ErrSetOptFlags with EO_LOGTO_USRFILE as the argument.  The name of the file can be modified with the ErrSetLogfile function.

When interpreting an error message, it is sometimes useful to know something about the context in which the error was posted.  For example, knowing that an error is from the ASN.1 function library as opposed to the network services library might be of assistance in diagnosing problems with a client program that retrieves ASN.1 data from a network service.  In the past we have used defined integer context codes for this purpose.  However, for a variety of reasons, we now prefer to use a string to indicate the context, or module, in which the error occurred.  At a finer granularity, you might want to know the filename and line number in the C source file in which the error was posted, but that is mainly of interest to programmers and not shown by default.  In order to allow some context information to be captured with minimal effort, we make use of two macros, THIS_MODULE and THIS_FILE, which you can (but are not required to) define once at the top of each source file.  Both represent strings and may be defined as NULL if they are to be ignored.  If you do not define them at all, you will inherit the default definitions from ncbierr.h:

#ifndef THIS_MODULE

#define THIS_MODULE  NULL

#endif

#ifndef THIS_FILE

#define THIS_FILE  __FILE__

#endif

ErrPostEx is actually implemented as a macro, which passes these two strings, along with the line number, to the toolbox functions (this obviates the need for several additional arguments).  Since not all linkers will merge duplicate strings, it is usually best to instantiate string variables for the module and filename and define the macros as aliases.  Without doing this it is possible to end up with one copy of each string for each expansion of the ErrPostEx macro. 

A typical example would be:

/* One shared string variable for the module name, so that each expansion of ErrPostEx refers to it instead of possibly carrying its own copy of the literal */

static char *this_module = "MyModule";

#define THIS_MODULE  this_module

/* Likewise, one shared string variable for the source file name */

static char *this_file = __FILE__;

#define THIS_FILE this_file

 

#include <ncbi.h>

 

If you wish to include the ncbi.h header file first, then you can undefine the symbols prior to redefining them. 

A recent enhancement to the error processing code is support for error message files.  These files may contain information allowing you to (1) convert integer error codes to a mnemonic string on output, (2) provide a verbose explanatory message to be appended to the standard error message, and (3) specify the severity level to be used for any error.  The files are plain ASCII text and fairly easy to edit, so they may be used to customize error reporting according to the preferences of individual users.

Posting An Error

void ErrPost (int context, int errcode, const char *fmt, ...)

Posts a fatal error that is defined by errcode and described to the user by means of a string that is generated from the format string fmt and a variable number of arguments.  The context argument is effectively the equivalent of the module, but it is only displayed if THIS_MODULE has not been defined to anything (i.e., if it is defined to NULL, as it is in ncbierr.h).

NOTE: This is an old function that has been retained for compatibility purposes.  New code should use ErrPostEx instead.

int ErrPostEx (ErrSev sev, int errcode, int subcode,
const char *fmt, ...)

Posts an error of severity sev.  The error is defined by errcode and subcode and described to the user by means of a string that is generated from the format string fmt and a variable number of arguments.  The return value is the same as that returned by ErrPostStr (see below).  The possible severity codes are:

Symbol                                    Description

SEV_INFO                           Purely an informational message, not a true error.

SEV_WARNING               Warning of a possible error condition.

SEV_ERROR                       An error has occurred but execution can continue.

SEV_FATAL                       A fatal (non-continuable) error has occurred.

int ErrPostStr (ErrSev sev, int errcode, int subcode,
const char *str)

Posts an error as described for the ErrPostEx function except that str contains the descriptive error text as a single string instead of a format string plus variable argument list (hence, it can be called from programs that are written in languages other than C or C++). 

Both ErrPost and ErrPostEx call ErrPostStr after formatting the string and this is where the real work takes place.  First, an internal ErrDesc structure is populated with all of the information describing the error that occurred.  If logging is enabled and sev is greater than or equal to the current LogLevel, this information is then logged according to whatever style flags have been set using the ErrSetOptFlags function.  The user-supplied error handler function is given the first opportunity to handle the error.  If it returns zero or if there is no such function, default processing takes place.  If sev is greater than or equal to the current MessageLevel, Message is called to display the error to the user.  If sev is greater than or equal to the current FatalLevel, the program is halted by calling AbnormalExit.

The return value is one of the "answer codes" (e.g. ANS_OK) that may be returned by the MsgAlert function.  If the programmer has installed an error handler function, it should return one of these codes if it handles the error or zero otherwise.  If MsgAlert was called as a result of default error processing, its result value is returned to the caller.  If neither of these is true, zero is returned.

User Error Strings

One thing we have found to be quite useful is the ability to include in the error messages additional strings defined by the user (meaning the programmer in this case) in order to provide additional information about the context in which the error occurred.  For example, imagine that you have a program that streams through every record in a sequence database performing some sort of analysis or calculation.  Before processing each record you could call ErrUserInstall, supplying (say) its accession number as the string.  Then, if any error occurs during the run, you would know which record was being processed at the time because its accession number would be part of the error message.

ErrStrId ErrUserInstall (const char *msg, ErrStrId id)

If id is zero, the string msg is added to the list of user-defined error strings and a unique id value for that string is returned.  Otherwise, the text of an existing entry in the list identified by id is replaced with msg.

Boolean ErrUserDelete (ErrStrId id)

Deletes the user-defined error string identified by id (returned by ErrUserInstall).

void ErrUserClear ()

Clears the entire list of user-defined error strings.

Customization

int ErrSetFatalLevel (ErrSev level)

Sets the minimum severity that will result in a fatal exit to level.   The return value is the previous setting.  The default value is SEV_FATAL, but changing it to SEV_MAX will prevent the application aborting.

int ErrGetFatalLevel ()

Returns the current FatalLevel value.

int ErrSetMessageLevel (ErrSev level)

Sets the minimum severity that will be displayed to the user (via the Message function) to level.  The return value is the previous setting.  The default value is SEV_WARNING, but setting it to SEV_MAX will disable all error reporting.

int ErrGetMessageLevel ()

Returns the current MessageLevel setting.

int ErrSetLogLevel (ErrSev level)

Sets the minimum severity that will be logged to level.  The return value is the previous setting.  The default value is SEV_INFO.  Note that one of the log output channels (logfile, stderr, or trace) must be enabled before any logging will occur.

int ErrGetLogLevel ()

Returns the current LogLevel setting.

int ErrSetLogfile (const char *filename, unsigned long flags)

Sets the name of the error log file to filename (from the default name "error.log").  Note that the EO_LOGTO_USRFILE option flag must be set with ErrSetOptFlags (see below) to actually enable logging to the named file.  The flags may be any of the following, which may be combined with the bitwise-OR operator.

Symbol                                    Description

ELOG_BANNER               Writes a banner line with the current time and date.

ELOG_APPEND               Appends to an existing file (if there is one).

ELOG_NOCREATE         Do not attempt to create the file at this time (wait until the first error is posted).  Ignored if ELOG_BANNER given.

unsigned long ErrSetOptFlags (unsigned long flags)

Sets one or more bit-flags, which should be combined with the bitwise-OR operator into the flags argument.  The flags may be any of the following [default state in brackets]:

Symbol                                    Description

EO_LOG_SEVERITY       Log an indication of the severity (e.g. "WARNING") to the file [yes]

EO_MSG_SEVERITY       Show an indication of the severity (e.g. "WARNING") to the user [yes]

EO_SHOW_SEVERITY   EO_LOG_SEVERITY | EO_MSG_SEVERITY

EO_LOG_CODES             Log the module name, error code, and subcode [yes]

EO_MSG_CODES             Show the module name, error code, and subcode to the user [yes]

EO_SHOW_CODES         EO_LOG_CODES | EO_MSG_CODES

EO_LOG_FILELINE         Log the source file and line number at which ErrPostEx was called [yes]

EO_MSG_FILELINE        Show the source file and line number to the user [no]

EO_SHOW_FILELINE    EO_LOG_FILELINE | EO_MSG_FILELINE

EO_LOG_USERSTR         Log programmer-defined error strings [yes]

EO_MSG_USERSTR        Show programmer-defined error strings to the user [yes]

EO_SHOW_USERSTR    EO_LOG_USERSTR | EO_MSG_USERSTR

EO_LOG_ERRTEXT        Log the error message [yes]

EO_MSG_ERRTEXT        Show the error message to the user [yes]

EO_SHOW_ERRTEXT    EO_LOG_ERRTEXT | EO_MSG_ERRTEXT

EO_LOG_MSGTEXT       Retrieve and log the verbose explanatory text from a message file [no]

EO_MSG_MSGTEXT       Retrieve the verbose explanatory text from a message file and present it to the user [no]

EO_SHOW_MSGTEXT   EO_LOG_MSGTEXT | EO_MSG_MSGTEXT

EO_XLATE_CODES        Translate the integer error code and subcode into the mnemonic strings defined in an error message file.  (If the file cannot be found, the integer values are displayed as if this flag were not set.)

EO_BEEP                             Produce an audible beep when displaying an error to the user [no]

EO_WAIT_KEYPRESS    Wait for the user to press a key or button before continuing [no]

EO_PROMPT_ABORT    Prompt the user as to whether to abort [no]

EO_LOGTO_USRFILE    Log to the error log file [no]

EO_LOGTO_STDOUT    Log to stdout [no]

EO_LOGTO_STDERR     Log to stderr [no]

EO_LOGTO_TRACE       Log to the "trace device" (see TRACE, below) [no]

unsigned long ErrClearOptFlags (unsigned long flags)

Clears one or more bit-flags (see above), which may be combined with the bitwise-OR operator into the flags argument.

unsigned long ErrTestOptFlags (unsigned long flags)

Tests one or more bit-flags (see above), which may be combined with the bitwise-OR operator into the flags argument.

void ErrSaveOptions (ErrOpts *erropt)

Copies all error option settings to the local buffer pointed to by erropt.  This will include severity levels for logging, displaying, and aborting as well as all option flags.  This should be done prior to changing any settings if you intend to later restore the state.

void ErrRestoreOptions (const ErrOpts *erropt)

Restores the error options state using the information that was previously captured using ErrSaveOptions in the buffer pointed to by erropt.

Configuration File Settings

The main configuration file for the NCBI toolkit (variously called .ncbirc, ncbi.ini, or ncbi.cfg, etc., depending on the platform) may contain settings within the "ErrorProcessing" section to provide additional runtime customization.  Here are some example settings:

[ErrorProcessing]

 

;  need to tell the system where the message files are kept

MsgPath=/sun/ncbi/errmsg

 

SEV_INFO    = "==> note    "

SEV_WARNING = "==> WARNING "

SEV_ERROR   = "==> ERROR   "

SEV_FATAL   = "==> FATAL   "

 

;  override a few of the option flags

EO_SHOW_MSGTEXT = 1   ;always show me everything...

EO_BEEP = 0           ;...but those beeps drive me nuts

The MsgPath key is used to tell the system where to look for error message files.  If this setting is not present, only the current directory will be examined.  Failure to locate the error message file will not prevent any application from running.  Instead, it will simply not be possible to convert integer error codes to strings or to display verbose error messages.

The strings that are used to indicate the severity of the error ("WARNING", for example) may be modified if desired.  To do so, use the same symbols used to indicate severity in your code (e.g., SEV_WARNING) as the key with the desired string as the value.  In the example above, the strings are quoted, but this is only required if leading or trailing spaces are to be included in the string.

In a similar fashion, each of the option flags may be set or cleared by using the symbol for that flag as the key and either 1 (one) or 0 (zero) as the value (alternatively, you can use YES/NO or TRUE/FALSE).  Note that these settings override anything that the programmer may have chosen to implement.  For example, if the configuration file contained the line EO_BEEP=0, there would be no beeps sounded on an error even if the code explicitly contained the command ErrSetOptFlags(EO_BEEP).


Preparing Error Message Files

You can use your favorite text editor to prepare error message files as they are plain ASCII text.  However, the name of the file is significant; it must be derived from the module name by converting to all lower case characters and appending the ".msg" extension.  For example, the message file for the "CoreLib" module should be called "corelib.msg" (shown below).  The first line of the file should consist of the keyword "MODULE" followed by the name of the module (e.g. "CoreLib" in the example below). 

MODULE CoreLib

 

$$ NoMemory, 1, SEV_FATAL

 

$$ File, 2, SEV_INFO

 

$^   Open, 1

This often indicates that the file simply does not exist.

Alternatively, it may exist but you do not have permission to

access the file in the requested mode.

 

$^   Read, 2

Not sure what would cause this...

 

$^   Write, 3, SEV_FATAL

This may indicate that the filesystem is full.

 

$$ Math, 3

$^   Param, 1

$^   Domain, 2

$^   Range, 3

$^   Iter, 4

 

$$ SGML, 4

$^   Init, 1

$^   Braces, 2

$^   Entity, 3

Lines beginning with "$$" are used to define a main-level error code.  The first two (comma delimited) tokens on the line are the mnemonic string and integer representations of the error.  In the example above, the string "NoMemory" is equated to error code 1.  These two tokens are required, but a third optional token may be supplied to specify a severity level to be used when posting that error.  Note that this overrides the severity used by the programmer and therefore allows for runtime customization of the program.  In the example above, all "NoMemory" errors would be fatal.

In a similar manner, lines beginning with "$^" may be used to define subcodes within the scope of the maincode that appears above it.  In the example above, "Open" is a subcode within "File".  A subcode can inherit a severity from its parent if it does not have one of its own.  For example, SEV_INFO would be used for all "File" errors with the exception of "Write", which would be SEV_FATAL.

Below any maincode or subcode line you may (optionally) enter a block of text to be used as the verbose error message.  This is appended to the actual error message posted by the program and is intended to provide additional explanation.  A common pitfall is to repeat the error message in the explanation.  For example, you would probably not want to begin the explanation for File.Write with "A file write error has occurred" because this would almost certainly be in the original error message. 

Fetching and Displaying Errors

int ErrPeek ()

Returns a non-zero value if an error is pending (i.e., has been posted but not yet processed).

int ErrCopy (ErrDesc *errdesc)

If an error is pending, information about the error is copied to the local buffer pointed to by errdesc and a non-zero value is returned.  The error is not cleared by this function and may still be displayed by calling ErrShow.  Zero is returned if no error is pending.

void ErrClear ()

Clears a pending error, if there is one.

int ErrFetch (ErrDesc *errdesc)

Copies the description of the pending error, if there is one, to errdesc and then clears the error condition (functionally equivalent to ErrCopy followed by ErrClear).

int ErrShow ()

If an error is pending and its severity is greater than or equal to the MessageLevel setting, it is displayed to the user via a call to the MsgAlert function.  If the EO_PROMPT_ABORT option flag has been set, the message includes the question "Abort, Retry, or Ignore ?".  In this case, if the user responds "Abort", the program aborts (otherwise, execution continues and either ANS_RETRY or ANS_IGNORE is returned so the caller may decide whether or not to distinguish between these alternatives).  If the EO_PROMPT_ABORT bit is cleared (as it is by default), the program will abort if the severity of the error is greater than or equal to the FatalLevel.

Installing Custom Error Handlers

ErrHookProc ErrSetHandler (ErrHookProc hook)

Installs hook as the function to be called when an error is posted.  If error logging is enabled, the error will already have been logged before the hook function is called.  The hook function takes a pointer to an ErrDesc structure and should return a non-zero value if it handled the error (preferably ANS_OK) and zero if it did not.  In the latter case, the system will perform the default error handling, which may involve displaying the error and/or halting the program.


Here's an Example:

#include <ncbi.h>

 

int LIBCALLBACK MyErrorHandler (const ErrDesc *err)

{

    /* anything other than a CoreLib out-of-memory error is declined; zero means we didn't handle the error */

    if (strcmp(err->cntxstr,"CoreLib") != 0 ||

               err->errcode != E_NoMemory)

    {

          return 0;

    }

    Beep(); Beep(); Beep(); /* something they'll notice! */

    ReleaseLifeboat(); /* free up memory reserves */

    /* display the pending error and pass its answer code back to the caller */

    return ErrShow();

}

 

int main (int argc, char **argv)

{

    /* from here on, posted errors are offered to MyErrorHandler before default processing */

    ErrSetHandler(MyErrorHandler);

 

    ... do stuff ...

 

    return 0;

}

 

Miscellaneous Utility Functions

void ErrLogPrintf (const char *fmt, ...)

Formats a string using a printf-style format string fmt and a variable-length list of arguments and then writes it to any error logging streams that may have been enabled.

void ErrLogPrintStr (const char *str)

Similar to ErrLogPrintf, except that str is a single string to be written to the error logging streams (for users of programming languages other than C or C++).

void AbnormalExit (int)

Terminates the program immediately.  This function should only be called if an application is not capable of exiting any other way.  Cleanup code (e.g. closing files and sockets) will not generally get called before the program halts.  On some systems, calling this function may also invoke a debugger if one has been installed.

Files and Directories

[[ ...insert text here... ]]

ANSI-Style Functions

NCBI Toolbox

ANSI C

Description

FileOpen

fopen

Opens a file for reading or writing

FileClose

fclose

Closes an open file

FileRead

fread

Reads bytes from an open file

FileWrite

fwrite

Writes bytes to an open file

FileGets

fgets

Reads a string from an open file

FilePuts

fputs

Writes a string to an open file

Directory Management

Boolean CreateDir (char *pathname)

Creates a directory called pathname.  The return value indicates success or failure.

void FileCreate (char *fileName, char *type, char *creator)

[[ ...insert text here... ]]

char* TmpNam (char *tmp)

Generates a unique temporary file name.  If a pointer to a string buffer is supplied as the tmp argument, the name is placed there; otherwise it is formatted in a static buffer within the library.  Either way, the return value points to the generated temporary file name.

Boolean FileRemove (char *fileName)

Removes the file fileName, if it exists.  The return value indicates success or failure.

Boolean FileRename (char *oldFileName, char *newFileName)

Changes the name of a file from oldFileName to newFileName.  The file cannot generally be moved from one directory to another, so no path should be included.  The return value indicates success or failure.

char* FileBuildPath (char *root, char *subpath, char *filename)

[[ ...insert text here... ]]

char* FileNameFind (char *pathname)

Returns a pointer to the filename portion of pathname.  For example, on a Macintosh system, FileNameFind("harddisk:System Folder:Excel Settings") would return a pointer to "Excel Settings".

Int4 FileLength (char *fileName)

Returns the length (in bytes) of the file called fileName.  If the length cannot be determined, zero is returned.

CD-ROM

Boolean EjectCd (char *sVolume, char *deviceName, char *rawDeviceName, char *mountPoint, char *mountCmd)

Ejects a CD-ROM from the device.  This function has no effect on DOS and Microsoft Windows systems.

Parameter

Operating System

Description

sVolume

MacOS

Volume name, e.g. “SEQDATA”

deviceName

UNIX

Name of CD-ROM device, e.g. “/dev/sr0”

rawDeviceName

UNIX

Name of raw device, e.g. “/dev/rsr0”

mountPoint

UNIX, VMS

Filesystem location where CD-ROM data should be mounted, e.g. “/cdrom”

mountCmd

UNIX

A script or program that performs the ejecting and mounting actions.  For many UNIX systems, mounting requires super-user privileges

Boolean MountCd (char *sVolume, char *deviceName,
char *mountPoint, char *mountCmd)

Mounts a CD-ROM.  The parameters are the same as those described for EjectCd, except that the raw device is not needed.  This function has no effect on DOS and Microsoft Windows systems.


Customization

void SetFileOpenHook (FileOpenHook hook)

Installs hook as the function to be called by FileOpen to actually open the file. The arguments of the hook function are the same as those for FileOpen.  In the following example, the hook function looks to see if a full path is given and, if so, first attempts to open a file of the same name in the current directory.  If that fails, a normal file open is performed using the fopen function.

FILE* LIBCALLBACK  MyFileOpenHook (const char *fname, const char *fmode);

 

Int2 Main ()

{

    SetFileOpenHook (MyFileOpenHook);

 

    ... do stuff ...

 

    return 0;
}

 

FILE* LIBCALLBACK  MyFileOpenHook (const char *fname, const char *fmode)

{

    /* Note:  FileOpen checks for NULL arguments, so we don’t

          have to do it here */

 

    char *p = strrchr(fname,DIRDELIMCHR);

    if (p != NULL)

    {

          FILE *fd = fopen(p+1,fmode);

          if (fd != NULL)

               return fd;

    }

    return fopen(fname,fmode);
}

 

Memory Management

Services for the dynamic allocation and deallocation of memory differ widely among platforms.  All of them provide standard ANSI functions, such as malloc, which allocates non-relocatable memory blocks that are referenced by pointers.  In addition, the Macintosh and Microsoft Windows environments provide the ability to allocate relocatable memory blocks, which is designed to reduce heap fragmentation.  Relocatable blocks are referenced by handles instead of pointers.  They must be locked before use and a valid pointer is obtained as part of the lock operation. When they are later unlocked, the pointer becomes invalid.  For most routine uses, we recommend using fixed memory.  Clearly, fixed memory is easier to use (since locking is unnecessary), and ongoing advances in chip architectures and system software on the microcomputer platforms are eliminating the performance advantage that relocatable memory currently offers.

ANSI-Style Functions

NCBI Toolkit

ANSI C

Description

Malloc

malloc

Allocates memory

Calloc

calloc

Allocates memory and clears it to zeros

Realloc

realloc

Changes the size of a previously allocated block.

Free

free

Frees a memory block.

MemCopy

memcpy

Copies a range of bytes.

MemMove

memmove

Copies a range of bytes (source and destination may overlap).

MemFill

memset

Sets a range of bytes to a particular value.


Fixed Memory

void* MemGet (size_t size, unsigned int flags)

Allocates a fixed memory block containing size bytes using options encoded in flags and returns a pointer to it. The flags may be any of the following:

Symbol                                    Description

MGET_CLEAR                  Clears the allocated memory to zeros.

MGET_ERRPOST             Posts an error (severity SEV_FATAL) on memory allocation failure.

void* MemNew (size_t bytes)

General-purpose fixed memory allocator that calls MemGet with the MGET_CLEAR and MGET_ERRPOST flags.

void* MemMore (void *ptr, size_t size)

Changes the size of a fixed memory block ptr to size bytes.  On failure, NULL is returned.

void* MemExtend (void *ptr, size_t size, size_t oldsize)

Changes the size of a fixed memory block ptr, the current size of which must be supplied as oldsize, to a new size of size.  If size is greater than oldsize, the additional memory is cleared to zeros.  On failure, NULL is returned.

void* MemDup (const void *ptr, size_t size)

Duplicates a fixed memory block ptr, the size of which must be supplied as the size argument.  On failure, NULL is returned.

void* MemFree (void *ptr)

Frees the fixed memory block ptr (if it is non-NULL).  The return value is always NULL.

Relocatable Memory

We provide functions for the manipulation of relocatable memory, but on systems where this is not available, fixed memory is used instead (and the type Handle is equivalent to Pointer).

Handle HandGet (size_t size, Boolean clear)

Allocates a moveable memory block containing size bytes.  On failure, NULL is returned (no error is posted).

Handle HandNew (size_t size)

Allocates a moveable memory block containing size bytes and clears it to zeros.  On failure, an error is posted (SEV_FATAL) and NULL is returned.

Handle HandMore (Handle hnd, size_t size)

Changes the size of moveable block hnd to size bytes.  On success, the return value is the re-sized block, which may or may not be the same as hnd.  On failure, NULL is returned. 

Handle HandFree (Handle hnd)

Frees moveable memory block hnd (if non-NULL).  The return value is always NULL.

void* HandLock (Handle hnd)

Locks moveable memory block hnd and returns a pointer to it.  The pointer remains valid until HandUnlock is called. 

void* HandUnlock (Handle hnd)

Unlocks moveable memory block hnd that was previously locked with HandLock.

NOTE: Ensure that you do not have nested HandLock/HandUnlock calls.  Although some systems will handle this situation properly, others do not.  Notably, on the Macintosh, a block will always be unlocked by HandUnlock regardless of how many times it was locked.

Byte Stores

We have implemented an additional type of dynamic storage called a ByteStore.  It is designed to look and behave much like an unformatted file, but its data exist in memory (however, due to various virtual memory schemes, the data may be in files after all!).  A ByteStore is especially useful for storing large amounts of data that would normally exceed the limits imposed by systems using 16-bit memory addressing.

A ByteStore is created by BSNew and the pointer it returns is a required argument for the remaining ByteStore functions.  At the end of its lifespan, it is deallocated by the BSFree function.

A ByteStore has a logical length, which is returned by BSLen and corresponds to the number of bytes of data it contains.  The physical length of a ByteStore is the actual amount of memory allocated and is often larger than the logical length.  All functions that add data to a ByteStore automatically take care of increasing the physical length to accommodate the logical length.

As with file I/O, a ByteStore uses the notion of a current position that is used for reading and writing data.  The functions BSSeek and BSTell provide a means of setting and querying the current position.  The functions BSRead, BSWrite, BSGetByte, and BSPutByte are analogous to functions used in file I/O.  However, unlike file I/O, a block of data may be inserted or deleted internally using BSInsert and BSDelete.

ByteStorePtr BSNew (long len)

Creates a ByteStore with an initial physical length of len and returns a pointer to it.  If len is zero, a default physical size is used.

ByteStorePtr BSDup (ByteStorePtr bs)

Creates a new ByteStore that is a copy of bs.

void* BSMerge (ByteStorePtr bs, void *buff)

Copies all of the data in ByteStore bs to a single memory buffer buff.  If buff is NULL, the function will allocate a buffer of the correct size.  Otherwise, it is the responsibility of the caller to ensure that the buffer is at least as large as the value returned by BSLen.  The return value is the pointer to the buffer containing the merged data.

ByteStorePtr BSFree (ByteStorePtr bs)

Deallocates ByteStore bs (if bs is NULL, nothing happens).  The return value is always NULL.

long BSLen (ByteStorePtr bs)

Returns the logical length of ByteStore bs.

Int2 BSSeek (ByteStorePtr bs, long offset, int origin)

Moves the current position of ByteStore bs to offset bytes from the point indicated by origin, which may be any of the following:

Symbol                                    New position

SEEK_SET                           offset bytes from the beginning of the ByteStore

SEEK_END                         offset bytes from the end of the ByteStore

SEEK_CUR                          offset bytes from the current position

long BSTell (ByteStorePtr bs)

Returns the current position of ByteStore bs.

long BSWrite (ByteStorePtr bs, void *buff, long len)

Writes len bytes of data from the memory buffer buff to the current position of ByteStore bs.  Following this operation, the current position will be increased by the number of bytes written.  The return value is the same as len if the write was successful or zero if not. 

long BSRead (ByteStorePtr bs, void *buff, long len)

Attempts to read len bytes of data from the current position of ByteStore bs to the buffer buff.  The return value is the number of bytes actually read, which may be less than len if the logical end of data is reached.  Following this operation, the current position will be increased by the number of bytes read.

long BSInsert (ByteStorePtr bs, void *buff, long len)

Inserts len bytes from memory buffer buff before the current position in ByteStore bs.  The current position is then increased by len so that it points to the position just after the inserted range.  The return value is the same as len if the insertion was successful or zero if not. 

long BSInsertFromBS (ByteStorePtr bs, ByteStore *bs2, long len)

Inserts len bytes into ByteStore bs by reading them from a second ByteStore bs2.  The return value is the actual number of bytes transferred, and the current positions of both ByteStores will be increased by this amount.

long BSDelete (ByteStorePtr bs, long len)

Deletes len bytes from ByteStore bs beginning at the current position (which is not changed by the operation).  The return value specifies the actual number of bytes deleted, which may be less than len if the logical end of data was reached.

Int2 BSPutByte (ByteStorePtr bs, int b)

Inserts the byte b at the current position of ByteStore bs and advances the position by one.  If b is equal to the constant EOF, the ByteStore is truncated at the current position.  The return value is b on success or EOF on failure.

Int2 BSGetByte (ByteStorePtr bs)

Returns the byte at the current position of ByteStore bs and advances the position by one.  If the logical end of data has been reached, the constant EOF is returned.

String Functions

ANSI-Style Functions

NCBI Toolkit

ANSI C

Description

StringLen

strlen

Gets string length

StringCpy

strcpy

Copies a string

StringNCpy

strncpy

Copies a string (n chars)

StringCat

strcat

Catenates strings

StringNCat

strncat

Catenates strings (n chars)

StringCmp

strcmp

Compares strings

StringNCmp

strncmp

Compares strings (n chars)

StringChr

strchr

Searches for a character in a string (from beginning)

StringRChr

strrchr

Searches for a character in a string (from end)

StringPBrk

strpbrk

Searches for the first character in a string that is a member of a specified set

StringStr

strstr

Searches for a substring in a string

StringSpn

strspn

Counts leading characters that are members of a specified set

StringCSpn

strcspn

Counts leading characters that are not members of a specified set

StringSet

strset

Sets all characters of a string to a specified character

StringNSet

strnset

Sets up to n characters of a string to a specified character

StringTok

strtok

Breaks a string into tokens

Refer to ANSI C documentation for details.

Additional String Functions

int StringICmp (const char *a, const char *b)

Compares strings a and b like StringCmp, but ignoring case (assumes ASCII character set). 

int StringNICmp (const char *a, const char *b, size_t n)

Compares up to the first n characters of strings a and b like StringNCmp, but ignoring case (assumes ASCII character set).

char* StringMove (char *dst, const char *src)

Copies the string src to dst and returns a pointer to the null byte that terminates the concatenated string.

char* StringSave (const char *str)

Copies str to a dynamically allocated memory block and returns a pointer to that block.

size_t StringCnt (const char *str, const char *list)

Searches the string str for any of the characters of the string list and returns the number of occurrences found.

Number Strings

Several functions are provided for converting integers to ASCII strings.  The following option flags determine how the string is formatted (may be combined with the bitwise-OR operator).

Symbol                                    Description

MISC_COMMAS               Insert commas only when |value| >= 10,000

MISC_ALLCOMMAS      Insert commas for any |value| >= 1,000

MISC_ANYCOMMAS     Both MISC_COMMAS and MISC_ALLCOMMAS

MISC_PLUSSIGNS           Prepend a plus sign (+) to positive values               

char* Ltostr (long x, int opts)

Converts the integer value x to ASCII using options opts.

int Lwidth (long x, int opts)

Returns the length of the string that would result from the conversion of integer value x to ASCII using options opts.

char* Ultostr (unsigned long x, int opts)

Converts the unsigned integer value x to ASCII using options opts.

int Ulwidth (unsigned long x, int opts)

Returns the length of the string that would result from the conversion of unsigned integer value x to ASCII using options opts.

Time Strings

Boolean DayTimeStr (char *buf, Boolean date, Boolean time)

Gets the current calendar time and generates a string representation in buf containing the date and/or time as specified by the date and time arguments.  The buffer should be of sufficient size to hold 24 characters.

SGML Strings

[[ ...insert text here... ]]

char* Sgml2Ascii (const char *sgml, char *ascii, size_t buflen)

Converts the SGML string in sgml into a printable ASCII string and copies it to ascii.  The buflen parameter gives the length of the ascii buffer.  The return value is the same as ascii [[check this]].

size_t Sgml2AsciiLen (const char *sgml)

Returns the length of the printable ASCII string that would result from conversion from SGML text.

ValNode Functions

A ValNode is a simple data structure that allows a mixture of data types to be grouped into a linked list. It contains a "choice" slot, which is used to discriminate the datatype held in the union called "data". ValNodes are used extensively in ASN.1 objects to represent CHOICE, SEQUENCE OF, and SET OF types. They are also used in other NCBI functions where a very flexible linked list is required.

typedef union dataval

{

    VoidPtr ptrvalue;

    Int4 intvalue;

    FloatHi realvalue;

    Boolean boolvalue;

} DataVal, *DataValPtr;

 

typedef struct valnode

{

    Uint1 choice;              /* to pick a choice */

    DataVal data;              /* attached data */

    struct valnode *next;      /* next in linked list */

} ValNode, *ValNodePtr;

ValNodePtr ValNodeNew (ValNodePtr node)

Creates a new ValNode and returns a pointer to it.  If desired, the newly-created node may be attached to the end of a linked list, of which node is the tail element.  Otherwise node should be NULL.

ValNodePtr ValNodeAdd (ValNodePtr *head)

Creates a new ValNode and returns a pointer to it.  The head argument points to a variable that contains the head element of a linked list of ValNodes to which the new node should be appended.  If head contains NULL, it will be initialized with the pointer to the newly-created ValNode.

ValNodePtr ValNodeAddBoolean (ValNodePtr *head, Int2 choice, Boolean bool)

Creates a new ValNode by calling ValNodeAdd and sets its choice member to choice and its data.boolvalue member to bool.  The return value is the new ValNode.

ValNodePtr ValNodeAddInt (ValNodePtr *head, Int2 choice, Int4 value)

Creates a new ValNode by calling ValNodeAdd and sets its choice member to choice and its data.intvalue to value.  The return value is the new ValNode.

ValNodePtr ValNodeAddFloat (ValNodePtr *head, Int2 choice, FloatHi value)

Creates a new ValNode by calling ValNodeAdd and sets its choice member to choice and its data.realvalue to value.  The return value is the new ValNode.

ValNodePtr ValNodeAddStr (ValNodePtr *head, Int2 choice, CharPtr str)

Creates a new ValNode by calling ValNodeAdd and sets its choice member to choice and its data.ptrvalue member to the string str.  The string is not copied to allocated storage.  The return value is the new ValNode.

ValNodePtr ValNodeCopyStr (ValNodePtr *head, Int2 choice, CharPtr str)

Creates a new ValNode by calling ValNodeAdd and sets its choice member to choice and its data.ptrvalue member to a copy of the string str.  The return value is the new ValNode.

ValNodePtr ValNodeAddPointer (ValNodePtr *head, Int2 choice, Pointer ptr)

Creates a new ValNode by calling ValNodeAdd and sets its choice member to choice and its data.ptrvalue to ptr.  The return value is the new ValNode.

ValNodePtr ValNodeLink (ValNodePtr *head, ValNodePtr node)

Adds node to the end of a linked list whose head element is in the variable pointed to by head.  If head contains NULL, it is initialized to the value of node.  The return value is always the head element of the linked list.

ValNodePtr ValNodeFree (ValNodePtr node)

Frees an entire list of ValNode structures of which node is the head element.  Whatever data may be referenced in the data member is not freed. The return value is always NULL.

ValNodePtr ValNodeFreeData (ValNodePtr vn)

Frees a list of ValNode structures like the ValNodeFree function, except that associated data is also freed.  This function should only be used if it is known that the data.ptrvalue member of every node in the list is either NULL or a valid pointer to a single fixed memory block.

ValNodePtr ValNodeExtract (ValNodePtr *head, Int2 choice)

Scans the linked list whose head element is in the variable pointed to by head for the first node whose choice element is equal to choice.  If found, the node is unlinked from the list and returned as the function result.  If it is not found, NULL is returned.

ValNodePtr ValNodeExtractList (ValNodePtr *headptr, Int2 choice)

Scans the linked list whose head element is in the variable pointed to by headptr for all nodes whose choice element is equal to choice.  The return value is the head element of a linked list of all such nodes.

ValNodePtr ValNodeFindNext (ValNodePtr head, ValNodePtr curr, Int2 choice)

Scans a linked list of ValNodes for a node whose choice member is equal to choice and returns a pointer to it.  The search begins with curr, if non-NULL, or head otherwise.  If choice is negative, the next node is returned.

ValNodePtr NodeListNew (void)

[[ ...insert text here... ]]

ValNodePtr NodeListFree (ValNodePtr head)

[[ ...insert text here... ]]

Int2 NodeListLen (ValNodePtr node)

Returns the number of elements in the linked list of which node is the head element.

ValNodePtr NodeListFind (ValNodePtr head, Int2 item, Boolean extend)

[[ ...insert text here... ]]

Boolean NodeListRead (ValNodePtr head, Int2 item, VoidPtr ptr, size_t size)

[[ ...insert text here... ]]

Boolean NodeListWrite (ValNodePtr head, Int2 item, VoidPtr ptr, size_t size)

[[ ...insert text here... ]]

Boolean NodeListAppend (ValNodePtr head, VoidPtr ptr, size_t size)

[[ ...insert text here... ]]

Boolean NodeListInsert (ValNodePtr head, Int2 item, VoidPtr ptr, size_t size)

[[ ...insert text here... ]]

Boolean NodeListReplace (ValNodePtr head, Int2 item, VoidPtr ptr, size_t size)

[[ ...insert text here... ]]

Boolean NodeListDelete (ValNodePtr head, Int2 item)

[[ ...insert text here... ]]

Math Functions

Macros

Macro

Description

LN2

Natural logarithm of 2

LN10

Natural logarithm of 10

EXP2(x)

Base-2 exponential of x

LOG2(x)

Base-2 logarithm of x

EXP10(x)

Base-10 exponential of x

LOG10(x)

Base-10 logarithm of x

Arithmetic Functions

long Gcd (long a, long b)

Returns the greatest common divisor of a and b.

long Nint (double x)

Returns the nearest integer to x.

Transcendental Functions

double Log1p (double x)

Returns log(x+1) for all x > -1

double Expm1 (double x)

Returns exp(x)-1 for all x

double Powi (double x, int n)

Returns the integral power of x

double Factorial (int x)

Returns x! (x factorial)

Gamma Functions

double Gamma (double x)

gamma(x)

double LnGamma (double x)

log(gamma(x))

double LnGammaInt (int n)

log(gamma(n)), integral n

double DiGamma (double x)

digamma(x) 1st order derivative of log(gamma(x))

double TriGamma (double x)

trigamma(x) 2nd order derivative of log(gamma(x))

double PolyGamma (double x, int order)

Nth order derivative of log(gamma)

void GammaCoeffSet (double *coef, unsigned dimension)

Change gamma coefficients

Advanced Functions

double LogDerivative (int order, double *u)

Nth order derivative of ln(u)

double NRBis (double y, double(*f) (double), double (*df) (double), double p, double x, double q, double tol)

Combined Newton-Raphson and Bisection root solver

double RombergIntegrate (double(*f)(double, VoidPtr), void *fargs, double p, double q, double eps, int epsit, int itmin)

Romberg numerical integrator

Miscellaneous Utilities

Macros

Macro

Description

ABS(a)

Returns the absolute value of a (any numerical type).

SIGN(a)

Returns -1 if a is negative, +1 if it is positive, or 0 if it is zero.

MIN(a,b)

Returns the minimum of a and b (any numerical type).

MAX(a,b)

Returns the maximum of a and b (any numerical type).

ROUNDUP(a,b)

Rounds a up to the nearest multiple of b.

DIM(a)

Returns the dimension (number of elements) in the array a.

Random Numbers

void RandomSeed (long seed)

Sets the seed value of the random number generator to seed.

long RandomNum ()

Returns the next value in the series of pseudo-random numbers.

Sorting

void HeapSort (void *base, size_t nel, size_t size,
int (LIBCALLBACK *cmp)(VoidPtr,VoidPtr))

Sorts an array of elements, which may be of any basic or structured type.  The starting address of the array is base, with nel and size being the number and size of elements in the array.  A pointer to an element comparison function cmp must also be supplied.

Time

time_t  GetSecs ()

Returns the current value of a timer that ticks once per second.

Boolean  GetDayTime (struct tm *dtp)

Returns the current time in broken-down format.

Process ID

long  GetAppProcessID ()

Returns a unique number identifying the process.

Application Properties

We will refer to a named block of arbitrary data associated with a single application instance as an application property.  Application properties have two main uses.  First, they allow for isolation of application instance data in certain shared library contexts where the data space would normally be shared by all applications using the library.  Second, they allow for a simple level of communication between code modules without requiring that they “know” anything about each other.  For example, during initialization of your program, you might create a property called “ProgramName” with a string giving the name of your program.  Other code modules might then use this property when generating various messages and reports.  Application properties are identified by a string with case being significant. They are created or modified with SetAppProperty, retrieved with GetAppProperty, and destroyed with RemoveAppProperty.  If you want to scan through the property list, use EnumAppProperties with a pointer to a callback function to be called once for each property.

void* SetAppProperty (const char *key, void *data)

Installs data, identified by key, as a property of the current application.

void* GetAppProperty (const char *key)

Returns the property data associated with key.  If the property does not exist, NULL is returned.

void* RemoveAppProperty (const char *key)

Removes the property of the current application that is identified by key, if there is one.  The data pointer for this property is returned as the function result, and it is the responsibility of the programmer to release whatever dynamic memory may be involved.

int EnumAppProperties (AppPropEnumProc proc)

Calls the user-supplied function proc once for each property of the current application.

Debugging Macros

The following macros are designed to assist in debugging during program development (or more precisely to prevent the need for debugging!) and are only enabled if the macro _DEBUG is defined during compilation.  However, when the time comes to build a "release version" to distribute to end-users, they can be easily disabled by recompiling without _DEBUG defined.

void TRACE (const char *fmt, ...)

Formats a string using fmt as a printf-style format string along with a variable number of arguments and then writes it to the "trace device".  What the trace device actually represents differs with the platform and compiler switches.  Under Microsoft Windows, traced messages go to the debugger console (AUX) if it is running.  Although similar facilities may exist on other platforms, none are supported at present (but we will entertain any suggestions you may have). For this reason the default “trace device” on UNIX systems is “stderr” and on all other platforms is a file called “trace.out”. This behavior may be circumvented by doing the following prior to including <ncbi.h>:

 

#define one of these symbols:

                TRACE_TO_STDOUT

                TRACE_TO_STDERR

                TRACE_TO_AUX (Windows only)

                TRACE_TO_FILE  (goes to “trace.out”)

Followed by:

                TRACE_DEVICE_DEFINED  (inhibits redefinition)

 

Note that all the above only makes TRACEing possible, but does not enable the feature. To do so, compile selected files with the symbol _DEBUG defined. This is _not_ done in the default makefiles. When _DEBUG is not defined, TRACE() has no effect.

ASSERT(expression)

If _DEBUG is defined, asserts that expression is TRUE.  If it evaluates to FALSE, a message is displayed giving the expression and the file name and line number where the assertion failed.  After this, the program halts through a call to AbnormalExit.  If _DEBUG is not defined, expression is never evaluated. 

VERIFY(expression)

Similar to ASSERT, except that expression is always evaluated.  This should be used if the expression contains an assignment or function call that should be executed regardless of whether or not _DEBUG is defined.

 

Portability Issues

There are always a variety of factors conspiring to hinder the portability of C code despite the best intentions of the programmer.  These barriers are due to differences in hardware, operating systems, compilers, and filesystems. 

We have attempted to sequester all system-specific definitions into a single header file called ncbilcl.h (which is included by ncbi.h).  It contains defined symbols describing the platform as well as type definitions and often a variety of macros.  The NCBI Toolkit includes a version of this file for each of the supported platforms.

Filename

Hardware

Operating System

Compiler

ncbilcl.370

IBM 370

AIX

System V cc

ncbilcl.acc

Sun SPARC

SunOS

Sun acc

ncbilcl.alf 

DEC Alpha-XP

OSF/1

DEC C compiler

ncbilcl.aov 

DEC Alpha-XP

OpenVMS

BSD cc

ncbilcl.aux

Macintosh 68K

A/UX

A/UX

ncbilcl.bor

Intel PC

MS-DOS

Borland C/C++

ncbilcl.bwn 

Intel PC

Windows DOS

Borland C/C++

ncbilcl.ccr 

Sun SPARC

SunOS

CodeCenter

ncbilcl.cpp

Sun SPARC

SunOS

Sun C++

ncbilcl.cra

Cray YMP

Unicos

Cray C compiler

ncbilcl.cvx 

CONVEX

UNIX System V

BSD cc

ncbilcl.gcc 

Sun SPARC

SunOS

Gnu gcc or g++

ncbilcl.hp

HP PA-RISC

HP-UX

System V cc

ncbilcl.mpw

Macintosh 68K

MacOS

Apple MPW C

ncbilcl.msc 

Intel PC

MS-DOS

Microsoft C

ncbilcl.msw 

Intel PC

Windows DOS

Microsoft C

ncbilcl.nxt

Next

NextStep

Next C compiler

ncbilcl.r6k

IBM RS 6000

AIX

System V cc

ncbilcl.sgi 

SGI MIPS

UNIX System V

System V cc

ncbilcl.sol 

Sun SPARC

Sun Solaris

SunPro

ncbilcl.sun

Sun SPARC

SunOS

BSD cc

ncbilcl.thc

Macintosh 68K

MacOS

Symantec C/C++

ncbilcl.ult

DEC MIPS

ULTRIX

System V cc

ncbilcl.vms

VAX

OpenVMS

BSD cc


Portable Types

In C, the sizes of basic types vary with each compiler implementation.  Certain minimum sizes are guaranteed by the ANSI standard, however.  The choice of which type to use in any particular situation may be based on the required precision and the natural word size of the hardware.  Always use the sizeof operator rather than assuming any particular size.

We have defined the following types.

Integral Types

Type

Description

Size

Min. Value

Max. Value

Boolean

A TRUE or FALSE value

1

FALSE

TRUE

Byte

Smallest unit of storage that a C program can address (unsigned)

1

0

UCHAR_MAX

Char

ASCII character occupying one byte of storage (may be either signed or unsigned)

1

CHAR_MIN

CHAR_MAX

Uchar

Unsigned ASCII character

1

UCHAR_MIN

UCHAR_MAX

Int1

Signed integer, 1 byte

1

INT1_MIN

INT1_MAX

Uint1

Unsigned integer, 1 byte

1

0

UINT1_MAX

Int2

Signed integer, 2 bytes

2

INT2_MIN

INT2_MAX

Uint2

Unsigned integer, 2 bytes

2

0

UINT2_MAX

Int4

Signed integer, 4 bytes

4

INT4_MIN

INT4_MAX

Uint4

Unsigned integer, 4 bytes

4

0

UINT4_MAX

Floating-point Types

Type

Description

Min. Value

Max. Value

FloatLo

Low-precision floating point value (same as float)

FLT_MIN

FLT_MAX

FloatHi

High-precision floating point variable (same as double)

DBL_MIN

DBL_MAX

Pointer Types

Type

Description

VoidPtr

Generic pointer (same as Pointer)

BoolPtr

Pointer to Boolean

BytePtr

Pointer to Byte

CharPtr

Pointer to Char

UcharPtr

Pointer to Uchar

Int1Ptr

Pointer to Int1

Uint1Ptr

Pointer to Uint1

Int2Ptr

Pointer to Int2

Uint2Ptr

Pointer to Uint2

Int4Ptr

Pointer to Int4

Uint4Ptr

Pointer to Uint4

FloatHiPtr

Pointer to FloatHi

FloatLoPtr

Pointer to FloatLo

FnPtr

Generic function pointer

Pointer

Generic pointer (same as VoidPtr)

Handle

Generic handle. Points to a block of memory that is moveable on Macintosh & Windows. On other platforms it is the same as a Pointer

Avoiding Name Collisions

The types are first typedeffed with names like Nlm_Int2.  Then they are defined with easier to use names like #define Int2 Nlm_Int2.  A similar procedure is used in declaring the utility functions.  This is because one wishes to treat them in your program as real data types.  However, if a conflict with a typedeffed name in some other program or header occurs, one cannot "untypedef" things, and it's a problem to use the other headers.  #defines can be undefined which solves the conflict problem.  We typedef with "Nlm_..." in the expectation that there will be no conflict with the name. We then #define that to something easier to remember, but more likely to conflict, and get the best of both worlds. The defined types are listed in the Portable Types section above.

Byte Order

The order of bytes within any integral value of size greater than 1 is defined by the hardware. 

Although other orderings are possible, none of the platforms we support has such a configuration.  One of the following symbols should be defined in every ncbilcl.xxx.

Symbol

Description

IS_BIG_ENDIAN

The target platform is "big endian", having the most significant byte in the lowest address.

IS_LITTLE_ENDIAN

The target platform is "little endian", having the most significant byte in the highest address.

Function Prototypes

A mechanism has also been worked out for declaring functions and prototypes such that compilers which can check function prototypes will check them, and those which don't do not see them (prototypes are syntax errors on older compilers!).  The trick is to declare the prototype with the PROTO(()) macro (note the double parentheses).  A similar macro, VPROTO(()), is provided for functions with variable argument lists.

Int2 StringCmp PROTO((CharPtr str1, CharPtr str2));

Int2 Message VPROTO((Int2 key, char *fmt, ...));



AsnLib: ASN.1 Processing


Introduction to ASN.1
AsnLib: Overview
Principles of Operation
Specification for AsnLib
AsnTool
AsnTool Tutorial
Using AsnLib
AsnLib: A Tutorial
Data-links
AsnLib Generated Header Files
Returns From AsnLib Parsing
Finding AsnTypePtrs at Run-time
Custom Read and Write Functions
Customizing an AsnIo Stream
ASN.1 Object Loaders
AsnLib and Object Loaders As a Generalized Iterator
AsnLib and Object Loaders Provide a Generalized Copy and Compare
AsnLib Interface: asn.h


 Introduction to ASN.1

Why ASN.1

Abstract Syntax Notation 1 (ASN.1) is used to describe the structure of data to be transferred between the Application Layer and the Presentation Layer of the Open Systems Interconnection (OSI).  It is meant to provide a mechanism whereby the Presentation Layer can use a single standard encoding to reliably exchange any arbitrary data structure with other computer systems, while the Application layer can map the standard encoding into any type of representation or language that is appropriate for the end user.  ASN.1 does not describe the content, meaning, or structure of the data, only the way in which it is specified and encoded.

These properties make it an excellent choice for a standard way of encoding scientific data.  Since ASN.1 does not specify content, specifications can be created as new concepts need to be represented.  Yet since it is an International Standards Organization (ISO) standard, the new specification can take advantage of various tools built to work with ASN.1 in general.  It removes from scientists the role of specifying ad hoc file formats, and focuses them instead on specifying the content and structure of data necessary to convey scientific meaning.

There are two aspects to ASN.1, the specification of the data and the encoded data itself.  The specification describes the abstract structure of the data and the allowed values various fields may take.  Frequently today scientific data is presented with no formal specification.  There may be some documentation describing the data file, but very often it is incomplete or not entirely accurate, since it is usually written about the file, rather than as an integral step toward building the file.  The ASN.1 specification is a formal language, which means it can be automatically and thoroughly checked for errors and inconsistencies in form by machine before any data are collected at all.  Further, it can be used by a computer to validate that any data presented correctly reflect that specification.  This is essential in eliminating the random errors and oversights in generating data files that plague scientific data now.  A utility program, asntool, was built with the AsnTool libraries to do this sort of checking and validation while developing ASN.1 specifications.

The requirement for a separate specification also means that interested parties can examine and evaluate the structure of the data independent of any particular database or data file.  One can understand the limits and strengths of a specification separately from the quality or amount of the data itself. Data structures that prove to be useful can be re-used in a variety of ways; by large public databases, by small private databases, in various software tools, and in assorted data files.

Finally, a separate specification means software to construct, decode, and validate any ASN.1 specified object can be built semi- or fully automatically from the specification.  Data encoded according to that specification can then be processed with relatively little manual programming for those aspects of the application dealing directly with ASN.1.  This is what the AsnTool routines are for.

Structure of ASN.1

ASN.1 has Type References, identifiers, and values.  A Type Reference is the name of an object defined in an ASN.1 specification.  An identifier is a field within an object.  A value is generally not included in the specification, but rather is the value of a Type Reference or an identifier in data encoded in ASN.1.  Values can be encoded in either a text or a binary form.  The examples here will obviously be in the text form.

Type References ALWAYS start with an upper case letter.  Identifiers ALWAYS start with a lower case letter.  Values depend on what type of value it is (integer, string, etc.) and examples are given below.  "-" (hyphen) is the ONLY separator character allowed in References and identifiers.

ASN.1 allows elements of SET, SEQUENCE, and CHOICE to not have identifiers if they can be distinguished from each other by their type (e.g. one is an integer and one is a string).  However, this can make the text value notation ambiguous and it may also lead to errors in the hands of the novice.  So we REQUIRE that every element of a SET, SEQUENCE, and CHOICE have an identifier.

ASN.1 also allows the specification of numerical tags (used for the binary encoding) in [] in addition to or in lieu of identifiers.  Again, this can be a problem for the novice.  Since we require identifiers, our software generates the numerical tags itself and we can ignore this.  It still supports explicitly defined APPLICATION, and PRIVATE tags, but that is beyond the scope of this document.  Comments begin with   --   and end with   --    or end of line.

A simple ASN.1 specification module example is shown below:

Demo-module DEFINITIONS ::=       -- Module-name DEFINITIONS ::= BEGIN

BEGIN

 

EXPORTS My-type;                         -- My-type can be used by other modules

 

IMPORTS Foreign-type FROM Other-module; -- can import types

 

                                         -- we define an object called My-type

My-type ::= SEQUENCE {                   -- My-type is a Type Reference

   first     INTEGER ,                  -- first is an identifier

   second    INTEGER DEFAULT 2 ,        -- second defaults to 2

   third     VisibleString OPTIONAL     -- third is an optional string

   }                                     -- end of object definition

 

Another ::= Foreign-type                 -- can reference other defined types

 

END                                      -- end of module, END required

Value notation (or data encoded in the text form of ASN.1) looks like this:

My-type ::= {

   first 42

   }

This means this My-type will have first = 42, second = 2, and third not present.  To present more than one My-type you must have defined:

 

My-type-set ::= SET OF My-type           -- in Demo-module

 

  Then you could have:

My-type-set ::= {                        -- start SET OF

   {                                     -- a My-type

       first 42

    } ,

   {                                     -- another My-type

       first 27 ,

       second 22 ,

       third "Everything set here"

   }

}                                        -- end of SET OF

ASN.1 Primitive Types Supported by AsnLib

Type,

Description

Specification

Value Notation

BOOLEAN

Any TRUE or FALSE value

May have a DEFAULT

Truth ::= BOOLEAN

Truth ::= FALSE

INTEGER

Any integer value.

May be given named values but range not limited to names.

May have a DEFAULT.

Number ::= INTEGER

or

Number ::= INTEGER {

     red (1) ,

     blue (2) }

Number ::= 42

or

Number ::= red

OCTET STRING

Any string of bytes.

Returned as or read from ByteStorePtr.

May not have DEFAULT.

Hstring ::= OCTET STRING

Hstring ::= '0A01F'H

NULL

null is only allowed value

Nothing ::= NULL

Nothing ::= null

REAL

Floating point number in base 2 or 10.

REAL value notation is 3 integers for { mantissa, base, exponent }

May have a DEFAULT.

Pi ::= REAL

Pi ::= { 314159, 10, -5 }

ENUMERATED

A named set of integer values.

Only named values allowed.

May have a DEFAULT

Sex ::= ENUMERATED {

     male (1) ,

     female (2) }

Sex ::= male

SEQUENCE

A series of other named types, in order.

Not related to a biological sequence.

All elements must be present unless OPTIONAL or DEFAULT

Yuppie ::= SEQUENCE {

     income   INTEGER ,

     name     VisibleString }

Yuppie ::= {

     income 100000 ,

     name "John Doe" }

SEQUENCE OF

A repeating series of a single type in order.

Stooges ::=

SEQUENCE OF VisibleString

Stooges ::= {

     "Larry" ,

     "Curly",

     "Moe" }

SET

A series of named other types.

Order does not matter.

All elements must be present unless OPTIONAL or DEFAULT

Yuppie ::= SET {

     income   INTEGER ,

     name     VisibleString }

Yuppie ::= {

     income 100000 ,

     name "John Doe" }

SET OF

A repeating series of a single type. Order does not matter.

Stooges ::=

SET OF VisibleString

Stooges ::= {

     "Larry" ,

     "Curly",

     "Moe" }

CHOICE

A way to select one from a set of alternate types.

NOTE:  in the value notation you are indicating one choice, so {} are not necessary (or allowed) but the identifier for the selected CHOICE must be given before the value.

Person ::= CHOICE {

     social-security INTEGER ,

     name VisibleString ,

     badge-id INTEGER }

Person ::= name "Joe"

VisibleString

A string of printable ASCII characters

NOTE: The double quote character (") may be included in a VisibleString by doubling it.
"He said ""Hi Mom!"" to her"
NOTE: AsnLib can accept wrapped long VisibleStrings.  That is, a string may contain internal newlines which are stripped on input from the value notation.
 Text ::= "He said ""Hi Mom!""
 to her"
would be read as:
"He said ""Hi Mom!"" to her"

Text ::= VisibleString

Text ::= "Hi Mom!"

StringStore

ONLY in AsnLib. Defines a VisibleString which is read into a ByteStore instead of a CharPtr. Used for long strings like DNA sequences.

Dna ::= StringStore

Dna ::= "AGGAGG"

Further information about ASN.1

 

The Open Book
A Practical Perspective on OSI
by Marshall T. Rose
Prentice Hall, Englewood Cliffs, New Jersey  07632
(c) 1990

 

ISO Development Environment  (public software)
University of Pennsylvania
Dept. of Computer Science and Information Science
Moore School
Attn: David J. Farber (ISODE Distribution)
200 South 33rd Street
Philadelphia, PA  19104-6314
1-215-898-8560

 

OSIkit Tools from NIST  (1989) (public software)
US Dept. of Commerce
National Institute of Standards and Technology
Gaithersburg, MD

 

Information Processing - Open Systems Interconnection - Specification of Abstract Syntax Notation One (ASN.1).  International Organization for Standardization and International Electrotechnical Committee, 1987. International Standard 8824.

 

Information Processing - Open Systems Interconnection - Specification of Basic Encoding Rules for Abstract Syntax Notation One (ASN.1).  International Organization for Standardization and International Electrotechnical Committee, 1987.  International Standard 8825.

 

Information Processing - Open Systems Interconnection - Abstract Syntax Notation One (ASN.1) - Draft Addendum 1:  Extensions to ASN.1.  International Organization for Standardization and International Electrotechnical Committee, 1987.  Draft Addendum 8824/DAD 1.

 

Information Processing - Open Systems Interconnection - Abstract Syntax Notation One (ASN.1) - Draft Addendum 1:  Extensions to ASN.1 Basic Encoding Rules.  International Organization for Standardization and International Electrotechnical Committee, 1987.  Draft Addendum 8825/DAD 1.

AsnLib: Overview

AsnLib is a library of functions developed by NCBI for manipulating and exchanging ASN.1 specifications and encoded data for scientific purposes.

A number of commercial and public domain tools are available for working with ASN.1 and for automatically building data handlers of various sorts. They are focused on the use for which ASN.1 was originally intended, the exchange of data between layers of the OSI.  As such they tend to automate the process more than AsnLib does, because the domain of use is much more limited.  The fact that they determine the internal data structures to use and write all the code to handle them themselves is not a big problem in this case.

When ASN.1 is used for scientific data description though, other uses will be made of the encoded data than may have originally been envisaged by the designers of these products.  For example, a scientist will often want an application which scans through a large complicated data structure, and just extracts certain fields for use, or even just counts occurrences of certain values.  A tool which automatically generates large elaborate data structures and lots of code to parse the stream, generate the structures, and store them in memory is inappropriate for such an application.  Further, a scientific application may well wish to manipulate that data in a different language than the tool is written in, such as FORTRAN, PROLOG, or LISP.  These applications may well wish to store the whole data structure from the stream, but they will not wish to use the data structures provided by the tool.

ASN.1 can be used to encode data in two ways, an ASCII human readable form called "value notation" or "print form", and a binary encoding.  ASN.1 has separate standards documents for the syntax (specification rules) and the binary encoding rules (BER, or "Basic Encoding Rules").  This was done on purpose to allow various encoding rules for the same abstract syntax.  The BER is, at this writing, the only official ISO encoding for ASN.1, but several other encodings which are faster or take less space, are under consideration by ISO.  Currently the only binary encoding AsnLib supports is BER.

The value notation or ASCII form of the data is not really an official ISO standard.  It was meant to provide a human readable form of ASN.1 data for development or explication, but not as a standard for data exchange. Nonetheless, value notation rules are given in the ISO documents for all the data types they describe.  With only a few additional rules, value notation is quite robust for data exchange.  These rules are listed in Appendix 1. While we do not recommend the ASCII form of ASN.1 encoded data for large amounts of data, it is very useful for developing and testing data representations or for generating ASN.1 values easily from other data files or local databases without specialized tools.  Since the value notation and binary encoded forms of data are completely and reliably interconvertable using AsnLib, there is no problem doing this.

Principles of Operation

AsnLib operates on atomic elements of ASN.1 specified data.  It is built using the NCBI core software tools and this document assumes you have some acquaintance with them.  AsnLib reads or writes strings, integers, etc. with single function calls.  Composite objects such as a SEQUENCE or a SET are read or written with a series of calls to read or write its component parts.  The process is designed to be relatively intuitive even in this case.  One calls a function to start encoding a SEQUENCE, then calls the routines to encode its parts, then calls a function to end encoding the SEQUENCE. NCBI has built functions to read and write such higher level objects in single function calls (described in the chapters on data), which use the low level AsnLib functions described here.

One can read and write any type using only three functions.  They take as arguments the identifier of an ASN.1 encoded stream (binary or ASCII), a pointer to a node in a parse tree (generated from the ASN.1 specification), and a pointer to a union which can hold a value of any type.  All aspects of how to encode a value properly, error checking to be sure that all appropriate nodes in the tree are visited in the proper order and that values are valid for a particular type are all taken care of within AsnLib and are not the concern of the application programmer.  The application programmers must read and understand the ASN.1 specification to make proper use of it, but all the other details of using ASN.1 correctly are not their concern.

The parse tree contains information about the type of every node, its name, its binary tag, allowed values, default values, and the next valid element. The header file also contains a series of #defines which associate names derived from the ASN.1 specification with pointers to nodes in the parse tree. Thus one's code would refer to JOURNAL_title, not a pointer to a specific node.  Using these defines means that if an ASN.1 specification is changed, but the names and types of nodes an application cares about have not changed, the application can be updated by just compiling with the new header file.

There are also functions which allow more interpreter-like code to be written.  One function will load an ASN.1 specification from a file, validate it, and build the appropriate parse tree on the fly, rather than at compile time by including a header file.  One can still identify nodes in the tree by name with a function that searches the tree for nodes with names matching a string.  As with all interpreter/compiler trade offs, such an application is slower, but more flexible.

AsnLib assumes that specifications will be written as a collection of smaller modules.  Data types may be declared as IMPORTS or EXPORTS by any module.  Multiple modules which reference each other may be loaded at once into AsnLib or through the interpreter function described above.  It will then link the modules before outputting the header file, thus effectively building a single parse tree containing all the modules.

In another approach, one might build a series of functions which handle the datatypes in a particular module.  Then when one writes code which uses a module which IMPORTS another module type, it is left unlinked in that parse tree and one just calls the appropriate function to read it.  AsnLib contains two functions for temporarily linking, then unlinking local parse subtrees to a parent object parse tree for this purpose.  We have begun to build a library of such modular object functions, so one need not link the whole world of possible datatypes into a single routine or module, or write the basic routines to create, destroy, and exchange such sub-objects.

Specification for AsnLib

AsnLib supports the following types from ISO 8824 and the ASN.1 enhancements.  The internal representation used by AsnLib (from the NCBI core tools) for routines dealing with these types is also shown.

Supported ASN.1 primitive types

type                                

internal representation                

BOOLEAN

Boolean          

INTEGER

Int4

OCTET STRING

ByteStorePtr

NULL

no value

REAL

FloatHi

ENUMERATED

Int4

SEQUENCE

no value

SEQUENCE OF

no value

SET

no value

SET OF

no value

CHOICE

no value

VisibleString

CharPtr

StringStore

ByteStorePtr

Other ASN.1 string types are supported as VisibleString.  No checks are made to ensure restrictions of character usage by the various string types. Types not supported by AsnLib at this point (although they will be accepted in a module specification as valid ASN.1) are:

Unsupported ASN.1 primitive types

BIT STRING

OBJECT IDENTIFIER

ObjectDescriptor

EXTERNAL

ANY

GeneralizedTime

UTCTime

The following keywords are currently supported by AsnLib:

Supported ASN.1 keywords

DEFINITIONS

BEGIN

END

EXPORTS

IMPORTS

FROM

APPLICATION

PRIVATE

UNIVERSAL

DEFAULT

OPTIONAL

FALSE

TRUE

The following ASN.1 keywords are not supported by AsnLib (although they are passed in a module specification as valid ASN.1):

Unsupported ASN.1 keywords

IMPLICIT

ABSENT

BY

COMPONENT

DEFINED

INCLUDES

MIN

MINUS-INFINITY

MAX

PRESENT

PLUS-INFINITY

SIZE

TAGS

WITH

AsnLib uses indefinite encoding for output of all binary encoded non-primitive types.  It can decode either definite or indefinite binary encoded data for all types.  This conforms to the BER.

DEFAULT values may be given in an ASN.1 specification.  AsnLib accepts and records them in the parse tree.  However, it does not supply the value if it is missing from the input stream on the assumption that the application would want to distinguish a value actually supplied from a value defaulted locally. DEFAULT is only supported for simple types like INTEGER or VisibleString, but not for structure types like SEQUENCE because it is too difficult to code.

Values may not be assigned in a specification module to types defined in a different module.  Each module is self contained and does not "know" anything about types defined in other modules except their names if they were IMPORTS. So suppose one module defines:

 

Dna-strand ::= ENUMERATED { plus(1), minus(2) }

 

A different module may not use the DEFAULT in the following case:

 Dna-sequence ::= SEQUENCE {

   length INTEGER ,

   strand Dna-strand DEFAULT plus }

 

because it does not know Dna-strand is ENUMERATED or what its allowed values are.  Such a construct is acceptable if the definition of Dna-strand and Dna-sequence are in the same module and the Dna-strand definition comes first.

Elements of a SEQUENCE are checked that they are all received or sent in the correct order and that no non-OPTIONAL or non-DEFAULT elements are missing.  However, because AsnLib does not store whole structures, it can only check that the types of elements in a SET are correct, but cannot check if more than one element of a type is used or if a required element is missing.  For this reason it is safer to use SEQUENCE rather than SET as a rule when using AsnLib.  While there is a semantic difference, there is no representational limitation in doing this.

AsnTool

An application program called "asntool" is built by the NCBI Software Toolkit using the AsnLib function libraries, which in turn are based on the NCBI portable core software tools. This application is a utility program which can:

1.             Read, write, and error check an ASN.1 specification.

2.             Read, write, and check ASCII values conforming to the specification in 1.

3.             Read, write, and check binary values conforming to the specification in 1.

4.             Combinations of 2 and 3 to translate or convert between binary and ASCII

5.             Output a C language header file which contains a parse tree for specification 1 which can be used in an application program.

AsnTool Tutorial

It may be quickest to demonstrate the use of AsnLib through example.  In the distribution directory of the NCBI Software Toolkit, \ncbi, there are two subdirectories. \demo contains demonstration source code to be used in the section below and 2 samples of MEDLINE entries as ASN.1 value notation (ASCII).  medline.ent is a single Medline-entry and medline.prt is a Pub-set containing many MEDLINE entries. \asn contains the ASN.1 specifications for the modules used to describe the MEDLINE entries.  They are:

File           

Module               

Description                           

general.asn

NCBI-General

general purpose data types

pub.asn

NCBI-Pub

branch point for various publication types.

biblio.asn

NCBI-Biblio

standard bibliographic citations for journals, books, manuscripts, patents based on ANSI standard

medline.asn

NCBI-Medline

MEDLINE entry (based on NCBI-Biblio)

asnpub.all

all

all above modules in one file

asntool should have been built as part of installing the system.  It is in \ncbi\bin.  Set your path, or move asntool to a place it can be executed.

From within the \demo directory, run asntool with no arguments.  It presents its argument usage to you.  Note that you must always give a module file name. asntool takes only one module file, so if you wish to use more than one you must concatenate them into a single file, such as asnpub.all.

Try the following exercises -- type:

 

asntool -m ..\asn\asnpub.all

 

This will read the publication modules and validate that they are correctly built.  asntool will notify you of various syntax errors and typos, usually giving the line number where the error occurred.  It makes sure that everything EXPORTS from a module is defined in that module and that everything IMPORTS is used by that module.  Everything not IMPORTS must be defined within the module.  In the case of multiple modules, it will try to link EXPORTS from one module with IMPORTS from others.  It is not an error to be unable to link an IMPORTS, but it does imply you expect it to be handled by an outside function.  There are no errors in asnpub.all, so asntool is silent.  The path may have a different form on various machines.

 

asntool -m ..\asn\asnpub.all -v medline.ent

 

This does everything above, and then reads the file medline.ent which it expects to be of a type defined in asnpub.all.  It checks for errors, reporting any it finds.  There are none, so asntool is silent.

 

asntool -m ..\asn\asnpub.all -v medline.ent -p stdout

 

On command line systems, everything above will happen, except that medline.ent will be encoded from asntool's internal structures to ASN.1 value notation on stdout, your terminal.  On Macintosh or Microsoft Windows, the output will go to a disk file named "stdout".

 

asntool -m ..\asn\asnpub.all -v medline.prt -e medline.val

 

This reads the set of MEDLINE records from medline.prt and encodes them in binary ASN.1 in the file medline.val

 

asntool -m ..\asn\asnpub.all -d medline.val -t Pub-set -p stdout

 

This reads (decodes) the set of MEDLINE records from the binary ASN.1 file we just made and outputs them as value notation on stdout.  Note that we MUST specify the type (Pub-set) of the binary file or message.  That is because the binary form does not have that information.  The value notation form does, so asntool can figure it out, but the binary, which is the real ISO standard, does not.

 

asntool -m ..\asn\asnpub.all -o allpub.h

 

This outputs a header file for an application which will use the asntool routines to encode and decode objects defined in asnpub.all.

Using AsnLib

If you take a look at the allpub.h you generated above, you will see that it includes <asn.h> which defines the interface to the AsnLib library and which includes <ncbi.h> which defines the interface to the NCBI core software tools.

Then the arrays of structures defining the parse tree come.  You should never program directly for these structures as they may change without notice. You should always use the functions described below.

Last come the #defines for pointers to specific nodes in the parse tree. They are built from the names of objects specified in the ASN.1 modules.  The name of the type itself is upper case, and component parts are in lower case. An example of the mapping between the ASN.1 specification medline.asn and the #defines in allpub.h is shown in Appendix 2. 

One less intuitive aspect of this system applies only to SET OF or SEQUENCE OF which are repeating series of the same type.  Since any one element of such a repeating series does not have a name, one must be invented.  This is done by appending a _E (for Element of) to the parent name (e.g., if Name-list ::= SEQUENCE OF VisibleString, then one name (VisibleString) of that SEQUENCE OF would have a #defined node name of NAME_LIST_E).  Names defined this way are limited to a maximum of 31 characters.  If they grow longer than that, the leftmost characters are truncated.  The suggestion is: keep names as short as you can and still be meaningful.  Also, since "-" is the only valid separator character in ASN.1 but "_" is the only valid separator character in C, the Name-list (mentioned above) node in the parse tree would be defined as NAME_LIST.

ASN.1 encoded values are represented basically as identifier/value pairs. AsnLib has two parsing functions that correspond to the members of the pair:

atp = AsnReadId(aip, amp, atp);

    Reads an identifier from an input stream (aip) and returns a pointer to the appropriate node in the parse tree for it (atp as the return value).  atp will be one of the nodes #defined in the header generated by AsnLib.

 

success = AsnReadVal(aip, atp, avp);

      Reads the value of atp from the stream (aip) into an AsnValue (a union of Pointer, Int4, Boolean, FloatHi).  If AsnReadVal() is called with avp = NULL, it skips over that value.  This is useful for scanning through a file extracting only a few fields.

To parse then, one basically just alternates AsnReadId() and AsnReadVal(). The most common error to make in writing a parser that uses these functions is to get out of synchronization alternating between these two routines.

There is only one function to write an identifier/value pair at once:

success = AsnWrite(aip, atp, avp);

      Writes the identifier pointed to by atp, and the value in avp, to the stream aip.

AsnLib: A Tutorial

In \ncbi\demo are three small demo applications that process medline entries and require the allpub.h header and the binary form of medline.prt we built in the sections above.  The make files for Microsoft C (makedemo.msc) and for all UNIX systems (makedemo.unx) are in \make.  Copy the makedemo file appropriate for your system into \ncbi\build and make it.

getmesh.c

Function:  Reads a Medline-entry, extracts the MeSH terms, and prints them.

Type "getmesh -" to see its arguments.

Type "getmesh -i medline.ent -o terms.out".  getmesh reads medline.ent, which contains a single Medline-entry in value notation (ASCII).  This file is presented at the end of this chapter, somewhat abbreviated, with the #defined names for the nodes in the allpub.h parse tree that will be encountered in the course of reading this file. getmesh parses it, extracts the MeSH terms and prints them in "terms.out".

Look at the source code in getmesh.c.

/*****************************************************************************

*

*   getmesh.c

*      gets mesh terms from a Medline-entry

*

*****************************************************************************/

#include <allpub.h>

 

/*
 * Command-line argument table for getmesh; passed to GetArgs() in Main().
 *
 * NOTE(review): the layout of Args is not shown here.  From the way the
 * entries are used, the leading string is the prompt/description, the second
 * string looks like the default value, the character is the command-line
 * switch, and the ARG_* constant is the argument kind.  The trailing
 * "0.0,0,NULL" presumably initializes the result slots (Main() reads
 * .intvalue and .strvalue) -- confirm against the Args declaration.
 */
#define NUMARGS 3

Args myargs[NUMARGS] = {

   /* [0] -i : input file; Main() opens myargs[0].strvalue with AsnIoOpen() */
   { "Input data", NULL, "Medline-entry", NULL, FALSE, 'i', ARG_DATA_IN, 0.0,0,NULL},

   /* [1] -b : input is binary; Main() tests myargs[1].intvalue ("F" by default) */
   { "Input data is binary", "F", NULL, NULL, TRUE , 'b', ARG_BOOLEAN, 0.0,0,NULL},

   /* [2] -o : output file; Main() opens myargs[2].strvalue for writing */
   { "Output list", NULL, NULL, NULL, FALSE, 'o', ARG_FILE_OUT, 0.0,0,NULL}};

 

Int2 Main()

{

   AsnIoPtr aip;

   AsnTypePtr atp;

   DataVal value;

   static CharPtr intypes[2] = { "r", "rb" };

   Int2 intype;

   FILE *fp;

 

    if (! AsnLoad())

        Message(MSG_FATAL, "Unable to load allpub parse tree.");

 

    if (! GetArgs("GetMesh 1.0", NUMARGS, myargs))

       return 1;

 

   if (myargs[1].intvalue)        /* binary input is TRUE */

       intype = 1;

   else

       intype = 0;

 

   if ((aip = AsnIoOpen(myargs[0].strvalue, intypes[intype])) == NULL)

   {

       Message(MSG_ERROR, "Couldn't open %s", myargs[0].strvalue);

       return 1;

   }

 

   if ((fp = FileOpen(myargs[2].strvalue, "w")) == NULL)

   {

       Message(MSG_ERROR, "Couldn't open %s", myargs[2].strvalue);

       return 1;

   }

 

   atp = MEDLINE_ENTRY;

 

   fprintf(fp, "MeSH terms =\n\n");

   while ((atp = AsnReadId(aip, amp, atp)) != NULL)

   {

       if (atp == MEDLINE_MESH_term)

       {

          AsnReadVal(aip, atp, &value);

          FilePuts(value.ptrvalue, fp);

          FilePuts("\n", fp);

          AsnKillValue(atp, &value);

       }

       else

          AsnReadVal(aip, atp, NULL);

   }

 

   aip = AsnIoClose(aip);

 

   FileClose(fp);

 

   return 0;

}

Pretty short for doing all this, isn't it?  Walking through the code:

0.             AsnLoad() is called to load the ASN.1 parse tree for "allpub" into memory.

1.             GetArgs() is called to display or get the command line arguments.

2.             The appropriate string is selected for opening a value notation ("r") or a binary ("rb") input stream.

3.             The input stream is opened with AsnIoOpen().

4.             The file for printed output is opened.

5.             atp is initialized to MEDLINE_ENTRY, the defined node we expect the input stream to start with.  If the input stream were ALWAYS value notation, atp could be set to NULL, and Medline-entry ::= would be read from the input file and atp set correctly.  Since getmesh takes binary and value notation, atp must be properly initialized.

6.             The main while loop just reads identifiers with AsnReadId() until it returns NULL, which is EOF.  The argument, amp, is the AsnModulePtr declared in allpub.h.  It is used to locate the appropriate AsnTypePtr (atp) if it was set to NULL on the first call.  After that, atp provides the link to the parse tree.

7.             In the while loop, a check is made to see if atp == MEDLINE_MESH_term, or the VisibleString containing a single MeSH term.  If so, we read the value with AsnReadVal(), print it, then call AsnKillValue() which will deallocate any storage used when any data type is read.  Since a VisibleString requires storage this is necessary.  There is no harm in calling AsnKillValue() even on types that do not allocate storage (e.g., INTEGER).

8.             If it's not a MeSH term, we call AsnReadVal() with a NULL argument for the AsnValuePtr, which just skips over the value to the next identifier.

9.             We close the streams.

10.          c'est tout.

indexpub.c

Function: Builds an index to medline.val based on Medline Unique Identifier.

Type "indexpub -" to see the arguments.

Type "indexpub -imedline.val".  indexpub will read the binary value file, medline.val, note the seek offset of the start of each Medline-entry it contains, identify the Medline uid for it, and build an index file, "medline.idx".

Take a look at the source code, indexpub.c.

/*****************************************************************************

*

*   indexpub.c

*      indexes a Pub-set by Medline UID

*

*****************************************************************************/

#include <allpub.h>

 

/* Number of entries in the myargs[] table below. */
#define NUMARGS 3

/*
 * Command-line argument table passed to GetArgs() in Main().
 * As used by Main():
 *   myargs[0] (option 'i') - name of the input data file, opened with AsnIoOpen()
 *   myargs[1] (option 'b') - boolean; non-zero selects binary ("rb") input mode
 *   myargs[2] (option 't') - name of the index file to write, opened with FileOpen()
 * NOTE(review): the second string in each entry looks like the default
 * value ("medline.val", "T", "medline.idx") - confirm against the Args
 * declaration in the core library.
 */
Args myargs[NUMARGS] = {

   { "Input data", "medline.val", "Pub-set", NULL, FALSE, 'i', ARG_DATA_IN, 0.0,0,NULL},

   { "Input data is binary", "T", NULL, NULL, TRUE , 'b', ARG_BOOLEAN, 0.0,0,NULL},

   { "Output index table", "medline.idx", NULL, NULL, FALSE, 't', ARG_FILE_OUT, 0.0,0,NULL}};

 

 

Int2 Main()

{

   AsnIoPtr aip;

   AsnTypePtr atp;

   DataVal value;

   Int4 seekptr, tempseek, uid;

   static CharPtr intypes[2] = { "r", "rb" };

   Int2 intype;

   FILE *fp;

 

    if (! AsnLoad())

        Message(MSG_FATAL, "Unable to load allpub parse tree.");

 

   if (! GetArgs("IndexPub 1.0", NUMARGS, myargs))

       return 1;

 

   if (myargs[1].intvalue)        /* binary input is TRUE */

       intype = 1;

   else

       intype = 0;

 

   if ((aip = AsnIoOpen(myargs[0].strvalue, intypes[intype])) == NULL)

   {

       Message(MSG_ERROR, "Couldn't open %s", myargs[0].strvalue);

       return 1;

   }

 

   if ((fp = FileOpen(myargs[2].strvalue, "w")) == NULL)

   {

       Message(MSG_ERROR, "Couldn't open %s", myargs[2].strvalue);

       return 1;

   }

 

   atp = PUB_SET;

   tempseek = 0L;

 

   while ((atp = AsnReadId(aip, amp, atp)) != NULL)

   {

       if (atp == PUB_SET_medline_E)

          seekptr = tempseek;

       if (atp == MEDLINE_ENTRY_uid)

       {

          AsnReadVal(aip, atp, &value);

          uid = value.intvalue;

          fprintf(fp, "%ld %ld\n", uid, seekptr);

       }

       else

          AsnReadVal(aip, atp, NULL);

       tempseek = AsnIoTell(aip);

   }

 

   aip = AsnIoClose(aip);

   FileClose(fp);

   return 0;

 

}

It is the same basic structure as getmesh.c.  However, the use of the while loop is a little different.  Since we are building an index, we want to record the offset in the file of the identifier which starts each medline entry in the Pub-set (PUB_SET_medline_E -- a PUB_SET of type medline is a SET OF Medline-entry).  So tempseek is set (to 0 to begin with, then with AsnIoTell()) BEFORE each read of an identifier with AsnReadId().  When the return value is PUB_SET_medline_E we know that tempseek contains the seek offset just before the first identifier for the Medline-entry.  Then we read through the entry looking for the MEDLINE_ENTRY_uid since we want to index on the MEDLINE Unique Identifier. When we find it, we store the seek offset and the uid in the index file.  All other values are skipped.

getpub.c

Function: Uses the index created by indexpub.c to retrieve a Medline-entry from medline.val by Medline uid.

/*****************************************************************************

*

*   getpub.c

*      does an indexed lookup for medline entries by medline uid

*

*****************************************************************************/

#include "allpub.h"

 

/* Number of entries in the myargs[] table below. */
#define NUMARGS 5

/*
 * Command-line argument table passed to GetArgs() in Main().
 * As used by Main():
 *   myargs[0] (option 'i') - input data file, always opened "rb" with AsnIoOpen()
 *   myargs[1] (option 'u') - Medline UID to look up (read via .intvalue)
 *   myargs[2] (option 't') - index file to read, opened with FileOpen()
 *   myargs[3] (option 'o') - output data stream, opened with AsnIoOpen()
 *   myargs[4] (option 'b') - boolean; non-zero selects binary ("wb") output
 * NOTE(review): the second string in each entry looks like the default
 * value - confirm against the Args declaration in the core library.
 */
Args myargs[NUMARGS] = {

   { "Input binary data", "medline.val", "Pub-set", NULL, FALSE, 'i', ARG_DATA_IN, 0.0,0,NULL},

   { "Medline UID to find", "88055872", NULL,NULL,FALSE,'u', ARG_INT, 0.0, 0, NULL },

   { "Input index table", "medline.idx", NULL,NULL,FALSE,'t', ARG_FILE_IN, 0.0,0,NULL },

   { "Output data", "stdout", "Medline-entry",NULL,FALSE,'o',ARG_DATA_OUT, 0.0,0,NULL},

   { "Output data is binary", "F", NULL, NULL, FALSE , 'b', ARG_BOOLEAN, 0.0,0,NULL}};

 

 

Int2 Main()

{

   AsnIoPtr aip, aipout;

   AsnTypePtr atp;

   DataVal value;

   Int4 seekptr, uid, uid_to_find;

   static CharPtr outtypes[2] = { "w", "wb" };

   Int2 outtype;

   FILE *fp;

   Boolean done, first;

   int retval;

 

    if (! AsnLoad())

        Message(MSG_FATAL, "Unable to load allpub parse tree.");

 

   if (! GetArgs("GetPub 1.0", NUMARGS, myargs))

       return 1;

 

   if (myargs[4].intvalue)        /* binary output is TRUE */

       outtype = 1;

   else

       outtype = 0;

 

   if ((aip = AsnIoOpen(myargs[0].strvalue, "rb")) == NULL)

   {

       Message(MSG_ERROR, "Couldn't open %s", myargs[0].strvalue);

       return 1;

   }

 

   if ((aipout = AsnIoOpen(myargs[3].strvalue, outtypes[outtype])) == NULL)

   {

       Message(MSG_ERROR, "Couldn't open %s", myargs[3].strvalue);

       return 1;

   }

 

   if ((fp = FileOpen(myargs[2].strvalue, "r")) == NULL)

   {

       Message(MSG_ERROR, "Couldn't open %s", myargs[2].strvalue);

       return 1;

   }

 

   uid_to_find = myargs[1].intvalue;

   done = FALSE;

   first = TRUE;

   while (! done)

   {

       retval = fscanf(fp, "%ld %ld", &uid, &seekptr);

       if (retval == EOF)

       {

          Message(MSG_ERROR, "UID %ld not found", uid_to_find);

          return 1;

       }

       if (uid == uid_to_find)

          done = TRUE;

   }

   FileClose(fp);

 

   atp = MEDLINE_ENTRY;

   AsnIoSeek(aip, seekptr);

   done = FALSE;

   while (! done)

   {

       atp = AsnReadId(aip, amp, atp);

       AsnReadVal(aip, atp, &value);

       AsnWrite(aipout, atp, &value);

       AsnKillValue(atp, &value);

 

       if (! first)

       {

          if (atp == MEDLINE_ENTRY)

              done = TRUE;

       }

       else

          first = FALSE;

   }

 

   AsnIoClose(aip);

   AsnIoClose(aipout);

 

   return 0;

}

This is a very simple program.  It looks up the seek offset into the file by uid, and seeks to that point with AsnIoSeek().  It then just cycles through the process of reading an identifier then reading a value from medline.val using AsnReadId() and AsnReadVal().  It then writes them both to the output file with AsnWrite().  Any storage used is freed with AsnKillValue(). Depending on the way the output AsnIo stream is opened, ASCII or binary, the program can deliver a binary Medline-entry or an ASCII conversion of it.

One important point to note is the way the while loop knows when it has finished reading a MEDLINE_ENTRY.  Since it is a SEQUENCE, which is basically a structure with component parts, AsnReadId() returns atp == MEDLINE_ENTRY twice: once when it reads the start of the structure, and once when it reads the end.  If you imagine the MEDLINE_ENTRY being bounded by braces {} as in the value notation, the process is this:

MEDLINE_ENTRY ::= { AsnReadId() gets MEDLINE_ENTRY, AsnReadVal() gets {

    one ,                         ( read the internal components )

    two

   }                AsnReadId() gets MEDLINE_ENTRY, AsnReadVal() gets }

To produce the same effect on output, there are two extra output functions for AsnLib, in addition to AsnWrite().

AsnOpenStruct(aip, atp, ptr)

                Writes the first instance of atp on the output stream aip at the beginning of a structure (SEQUENCE, SET, SEQUENCE OF, SET OF).

 

AsnCloseStruct(aip, atp, ptr)

                Writes the second, closing instance.

The "ptr" argument is a pointer to the internal C structure representing the ASN.1 structure. It is used by functions that piggyback on the AsnWrite functions to explore the internal objects (discussed below).

For this reason a similar function is provided to write a CHOICE.

AsnWriteChoice(aip, atp, choice, value)

                Writes a choice of types. The choice argument is an integer to indicate which type will be written at the next AsnWrite(), and value is a DataVal in which can be passed the internal C structure used to represent the choice.

 In the case of getpub.c, it is not necessary to call these functions because getpub is simply reading the data from an ASN.1 stream then writing it again in order, which includes the two instances of MEDLINE_ENTRY.

Another point about this program is that we recognized the Medline entries in the Pub-set in indexpub.c by looking for PUB_SET_medline_E, but we are reading and writing the same entry in getpub.c using MEDLINE_ENTRY.  That is because a Pub-set of CHOICE medline is defined as a SET OF Medline-entry.  So when reading the whole Pub-set, each Medline-entry is a PUB_SET_medline_E. But when reading one entry it is a MEDLINE_ENTRY.

Data-links

Data-links are described in the NCBI Core Tools document.  They are meant to be "ports" in and out of software applications which perform exchange of structured data (in ASN.1).  The inputs and outputs for getpub.c and getmesh.c are actually Data-links.  If you simply type the command:

 

getpub -u 88055872 -b -o stdout | getmesh -i stdin -b -o terms.out

 

you have executed a pair of programs which communicate over a Data-link with structured, binary encoded ASN.1.  getpub extracts a Medline-entry with uid = 88055872 from a binary encoded Pub-set by indexed look-up and transfers it via stdout, as a Medline-entry in binary, to getmesh, which parses the "message", locates MeSH terms, and prints them to terms.out.

This example is just a pipe between two programs, with the enhancement that the stream is binary coded ASN.1, which permits a very much richer "vocabulary" for the exchange than is usual for traditional pipes.  Further, since binary coded ASN.1 is a machine independent coding, the exchange could just as easily have been between two completely different machines over a network. Finally, this pipe is a single channel of exchange.  The principles hold if one expands the system to many channels, by a variety of means.

AsnLib Generated Header Files

Correspondence between ASN.1 and header #defines

Medline-entry ::= SEQUENCE {                    MEDLINE_ENTRY

   uid INTEGER ,                                      MEDLINE_ENTRY_uid

   em Date ,                                          MEDLINE_ENTRY_em

   cit Cit-art ,                                      MEDLINE_ENTRY_cit

   abstract VisibleString OPTIONAL ,                  MEDLINE_ENTRY_abstract

   mesh SET OF Medline-mesh OPTIONAL ,                MEDLINE_ENTRY_mesh

   substance SET OF Medline-rn OPTIONAL ,             MEDLINE_ENTRY_substance

   xref SET OF Medline-si OPTIONAL ,                  MEDLINE_ENTRY_xref

   idnum SET OF VisibleString OPTIONAL }        MEDLINE_ENTRY_idnum

 

Medline-mesh ::= SEQUENCE {              MEDLINE_MESH

   mp BOOLEAN DEFAULT FALSE ,                         MEDLINE_MESH_mp

   term VisibleString ,                               MEDLINE_MESH_term

   qual SET OF Medline-qual OPTIONAL }                MEDLINE_MESH_qual

Returns From AsnLib Parsing

Medline-entry with header #defines as returned when parsing with AsnLib

Medline-entry ::= {                    /MEDLINE_ENTRY

  uid 88055872 ,                      |   MEDLINE_ENTRY_uid

  em                                  |   MEDLINE_ENTRY_em

    std {                             |    /DATE_std

      year 1988 ,                     |   |   DATE_STD_year

      month 3                         |   |   DATE_STD_month

    } ,                               |    \DATE_std

  cit {                               |  /MEDLINE_ENTRY_cit

    title {                           | |  /CIT_ART_title

      name "Developmental .. protein."| | |   TITLE_name

    } ,                                | |  \CIT_ART_title

    authors {                         | |  /CIT_ART_authors

      names                           | | |  AUTH_LIST_names

        ml {                          | | |   /AUTH_LIST_names_ml

          "Giebel LB" ,               | | |  |   AUTH_LIST_names_ml_E

          "Dworniczak BP" ,           | | |  |   AUTH_LIST_names_ml_E

          "Bautz EK"                  | | |  |   AUTH_LIST_names_ml_E

        } ,                           | | |   \AUTH_LIST_names_ml

      affil                           | | |    AUTH_LIST_affil

        str "Zentrum ... Germany"     | | |      AFFIL_str

    } ,                               | |  \CIT_ART_authors

    from                              | |   CIT_ART_from

      journal {                       | |    /CIT_ART_from_journal

        title {                       | |   |  /CIT_JOUR_title

          ml-jta "Dev Biol"           | |   | |   TITLE_ml_jta

        } ,                           | |   |  \CIT_JOUR_title

        imp {                         | |   |  /CIT_JOUR_imp

          date                        | |   | |   IMPRINT_date

            std {                     | |   | |    /DATE_std

              year 1988 ,             | |   | |   |   DATE_STD_year

              month 1                 | |   | |   |   DATE_STD_month

            } ,                       | |   | |    \DATE_std

          volume "125" ,              | |   | |   IMPRINT_volume

          issue "1" ,                 | |   | |   IMPRINT_issue

          pages "200-7"               | |   | |   IMPRINT_pages

        }                             | |   |  \CIT_JOUR_imp

      }                               | |    \CIT_ART_from_journal

  },                                  |  \MEDLINE_ENTRY_cit

  abstract "Multiple ... protein." ,  |   MEDLINE_ENTRY_abstract

  mesh {                              |  /MEDLINE_ENTRY_mesh

    {                                 | |  /MEDLINE_ENTRY_mesh_E

      term "Amino Acid Sequence"      | | |   MEDLINE_MESH_term

    } ,                               | |  \MEDLINE_ENTRY_mesh_E

    {                                  | |  /MEDLINE_ENTRY_mesh_E

      term "Clathrin" ,               | | |   MEDLINE_MESH_term

      qual {                          | | |  /MEDLINE_MESH_qual

        {                             | | | |  /MEDLINE_QUAL

          subh "metabolism"           | | | | |   MEDLINE_QUAL_subh

        }                             | | | |  \MEDLINE_QUAL

      }                               | | |  \MEDLINE_MESH_qual

    } ,                               | |  \MEDLINE_ENTRY_mesh_E

    {                                 | |  /MEDLINE_ENTRY_mesh_E

      term "Heat-Shock Proteins" ,    | | |   MEDLINE_MESH_term

      qual {                          | | |  /MEDLINE_MESH_qual

        {                             | | | |  /MEDLINE_QUAL

          mp TRUE ,                   | | | | |   MEDLINE_QUAL_mp

          subh "genetics"             | | | | |   MEDLINE_QUAL_subh

        }                             | | | |  \MEDLINE_QUAL

      }                               | | |  \MEDLINE_MESH_qual

    }                                 | |  \MEDLINE_ENTRY_mesh_E

  } ,                                 |  \MEDLINE_ENTRY_mesh

  substance {                         |  /MEDLINE_ENTRY_substance

    {                                 | |  /MEDLINE_substance_E

      type cas ,                      | | |   MEDLINE_RN_type

      cit "9007-49-2" ,               | | |   MEDLINE_RN_cit

      name "DNA"                      | | |   MEDLINE_RN_name

    }                                 | |  \MEDLINE_substance_E

  } ,                                 |  \MEDLINE_ENTRY_substance

  xref {                              |  /MEDLINE_ENTRY_xref

    {                                 | |  /MEDLINE_ENTRY_xref_E

      type genbank ,                  | | |   MEDLINE_SI_type

      cit "M19141"                    | | |   MEDLINE_SI_cit

    }                                 | |  \MEDLINE_ENTRY_xref_E

  }                                   |  \MEDLINE_ENTRY_xref

}                                      \MEDLINE_ENTRY

Finding AsnTypePtrs at Run-time

The #defines described above are statically defined in a header file. But sometimes one must find the parse tree nodes (asntypes) from a module which does not include the parse tree itself. If all parse trees have been loaded using the AsnLoad() functions in the modules that include the parse trees, then they are globally accessible by name through a number of functions. AsnFind() takes a string with the name of an ASN.1 specified entity or a partial path (sub-entities separated by dots) to the entity and returns a pointer to its type node. For example,

AsnTypePtr atp;

 

   atp = AsnFind("Seq-entry.location");

will return the same pointer #defined as SEQ_ENTRY_location in the parse tree header file.

Other functions will return information about types at run-time. Using the atp obtained above for Seq-entry.location, which is a "Seq-loc", which is itself defined as the primitive type CHOICE:

CharPtr str;

 

   str = AsnFindPrimName(atp);    /* returns "CHOICE" */

   str = AsnFindBaseName(atp);    /* returns "Seq-loc"  */

For an ENUMERATED type one can get the values at run-time. For the ASN.1 specification:

Sex ::= ENUMERATED {

   male (1) ,

   female (2) };

the following code can be used:

AsnTypePtr atp;

CharPtr str;

 

   atp = AsnFind("Sex");

   str = AsnEnumTypeStr(atp, 2);     /* returns "female" */

   str = AsnEnumStr("Sex", 2);       /* also returns "female" */

Custom Read and Write Functions

The AsnLib read and write functions can be replaced to provide custom I/O using the AsnIoNew() function. This is how the NCBI network client/servers are implemented, by replacing the read and write functions with socket based routines. We have also used it to write blocks of ASN.1 in memory buffers for transfer in and out of databases. This is not normally something done by a novice, but several functions which read and write to memory are given in the toolkit as models of how to do this sort of thing.

   /*** read and write to memory buffer ***/

/*
 * Model routines for custom AsnIo read/write functions that operate on a
 * memory buffer rather than a file.
 * NOTE(review): the Read/Write prototypes carry unnamed parameters;
 * presumably (stream data, buffer, byte count) - confirm in the library
 * source before relying on it.
 */
extern AsnIoMemPtr AsnIoMemOpen PROTO((CharPtr mode, BytePtr buf, Uint2 size));

extern AsnIoMemPtr AsnIoMemClose PROTO((AsnIoMemPtr aimp));

extern Boolean AsnIoMemReset PROTO((AsnIoMemPtr aimp, Uint2 bytes_to_read));

extern Int2 AsnIoMemRead PROTO((Pointer, CharPtr, Uint2));

extern Int2 AsnIoMemWrite PROTO((Pointer, CharPtr, Uint2));

 

   /*** read and write to a ByteStore in memory ***/
   /* Same pattern as the AsnIoMem* routines, backed by a ByteStorePtr. */

extern AsnIoBSPtr AsnIoBSOpen PROTO((CharPtr mode, ByteStorePtr bsp));

extern AsnIoBSPtr AsnIoBSClose PROTO((AsnIoBSPtr aibp));

extern Int2 AsnIoBSRead PROTO((Pointer, CharPtr, Uint2));

extern Int2 AsnIoBSWrite PROTO((Pointer, CharPtr, Uint2));

 

Customizing an AsnIo Stream

Sometimes one wishes to change the details of a series of functions at run-time. This can be accomplished by attaching AsnOption structures to the stream. These form a linked list of structures which carry user defined data and are identified by user defined class and type values. A series of functions allow the options to be added, removed, or located on a stream pointer. These are used to customize the behavior of the object loaders (see below) under different run-time conditions, but have many other uses as well. AsnOptions are not the same as AsnExpOptStructs, or exploration structures used by the generalized iterator described below.

ASN.1 Object Loaders

About the only time it is efficient to read the lower level ASN.1 raw values is when there are just a few types of simple values that one is interested in processing.  For example, if one wanted to record the relative occurrence of journal titles in some particular ASN.1 file, one could find those without worrying about the objects.  However, most of the time it is much more convenient to load all or a portion of the ASN.1 information into C code structured objects. 

In general, when the ASN.1 stream is positioned at the beginning of a structure, one can call the <OBJECT>AsnRead function (replacing "<OBJECT>" with some object name) which returns a pointer of the <OBJECT>'s type to an allocated structure.  This structure can then be processed within the C code.  To use these objects, it is convenient to know both the ASN.1 definitions and the C structures, as well as any special function names which operate on them.  For this reason, these different kinds of format descriptions (ASN.1 definitions, C structure definitions, and function prototypes) all appear together, alphabetized by C code object type (if it exists, else using the ASN.1 definition) following this section.  For most objects, there are <OBJECT>New() functions which allocate memory and set any default values, <OBJECT>Free() functions, which release the memory, and <OBJECT>AsnRead() and <OBJECT>AsnWrite() functions for communication with the ASN.1 I/O stream.  These are true objects in that the upper level objects inherit the slots and "knowledge" about the lower level objects, so that when, for example, an <OBJECT>Free() routine is called on an object which is composed of (recursively) other sub-objects, their <SUB-OBJECT>Free() functions are used as needed. The same type of behavior is exhibited by the <OBJECT>AsnRead() and <OBJECT>AsnWrite() functions since they call the appropriate <SUB-OBJECT>AsnRead() and <SUB-OBJECT>AsnWrite() functions as needed.

The <OBJECT>New() functions take no parameters, and return an <OBJECT>Ptr.  The <OBJECT>Free() functions take an <OBJECT>Ptr parameter, pointing to the object that is to be returned to the heap, and return a NULL pointer of the same type.  The <OBJECT>AsnRead() functions take a pointer to an AsnIo stream (not a FILE *) that was opened with AsnIoOpen() and an AsnTypePtr which points within the parse tree to the type of the Id whose value follows.   An example of what is meant by this follows:

 

/*
 * Example: loading SeqEntry objects from an AsnIo stream.
 * Either the stream holds SeqEntry items directly, or it holds a
 * BioseqSet whose seq-set elements are loaded one at a time while every
 * other value is skipped.
 */
if ( -- expect seqentry only ---){
   atp = SEQ_ENTRY;
   while ((atp = AsnReadId(AsnFp, my_amp, atp)) != NULL) {
       the_set = SeqEntryAsnRead(AsnFp, atp);
       /*--process the SeqEntry --*/
       SeqEntryFree(the_set);
   }
} else {
   /*---Expect a BioseqSet----*/
   atp = BIOSEQ_SET;
   while ((atp = AsnReadId(AsnFp, my_amp, atp)) != NULL) {
       if (atp == BIOSEQ_SET_seq_set_E) {
          /*------------
          * The "..._E" is the type of the element of the
          * seq-set.  Generally, when there  are repeating elements
          * of the same type, the "_E" type holds a place in the parse tree.
          *--------------*/
          the_set = SeqEntryAsnRead(AsnFp, atp);

          /*--process the SeqEntry --*/
          SeqEntryFree(the_set);
       } else {
          /* Not interested in this value: a NULL value pointer makes
           * AsnReadVal() skip it, so no storage is allocated that would
           * otherwise need an AsnKillValue() call. */
          AsnReadVal(AsnFp, atp, NULL);
       }
   }
}

 

An <OBJECT>Ptr (or NULL on some error conditions) is returned.  The <OBJECT>AsnWrite() functions take the same parameters as the <OBJECT>AsnRead() functions, with the addition of an <OBJECT>Ptr to the object to be added to the ASN.1 stream.  The return is a Boolean (TRUE on success, FALSE on failure).

In many cases, these standard functions are all that are needed. In some special cases additional functions for comparing, duplicating, or displaying objects are provided as well. The object loaders are discussed in the following chapters which describe the NCBI data objects themselves. Finally there are chapters on utility functions which perform more complex operations on these objects.

AsnLib and Object Loaders As a Generalized Iterator

The ability to scan a stream of data and identify and extract data items in a very general way just using their names as defined in their ASN.1 specification is a very powerful aspect of AsnLib functionality. Since the object loader xxxAsnWrite() functions must exhaustively traverse the internal C structures to write them out, and must "know" both the ASN.1 specified type of every structure and field, one can use these functions to create a generalized iterator for the object loader structures in memory.

One can create a "null" output AsnIoPtr (although this will work on a real AsnWrite as well) by using:

AsnIoPtr aip;

 

   aip = AsnIoNullOpen();

One can then associate a data type from the ASN.1 specification or a partial path in the ASN.1 specification where each element is separated by dots. "Seq-loc" is the Seq-loc object no matter what its context. "Seq-feat.location" is a Seq-loc ONLY in the "location" slot of a Seq-feat. The Seq-feat itself can be in any context, since that is the top of the partial path. Whenever the object loader AsnWrite routine encounters a data item that satisfies the partial path, it can be made to call a user supplied callback function with arguments of a user defined data object and the data object that would be written. An AsnIoPtr can have as many of these options as desired. More than one callback can be associated with the same data type. More than one datatype can be associated with the same callback. Explore options are associated with a stream as in this program, which counts the features in a SeqEntry:

   /* User data handed to the explore callback through AsnExpOptNew();
      mycallback() increments counter once per Seq-feat it is called for. */
   typedef struct mydata {

       Int2 counter;

   } Mydata, PNTR MydataPtr;

 

/*** counts features in a SeqEntry ***/

Int2 countfeats(SeqEntryPtr sep)

   MydataPtr localptr;

   AsnIoPtr aip;

   Int2 num;

 

   localptr = (MydataPtr)MemNew(sizeof(Mydata));

   localptr->counter = 0;

   aip = AsnIoNullOpen();

   AsnExpOptNew(aip, "Seq-feat", (Pointer)localptr, mycallback);

   SeqEntryAsnWrite(sep, aip, NULL);   /* object loader write */

   num = localptr->counter;

   MemFree(localptr);

   AsnIoClose(aip);

   return num;

}

 

/*
 * Explore callback: counts one structure per invocation pair.
 * The writer calls this both when it opens and when it closes a
 * structure; only the opening call (START_STRUCT) is counted.
 */
void mycallback (AsnExpOptStructPtr aeosp)
{
   MydataPtr tally;
   SeqFeatPtr feat;

   if (aeosp->dvp->intvalue == START_STRUCT)
   {
       /* The structure being written is available here.  It is not
          needed for counting; fetched only to show where it lives. */
       feat = (SeqFeatPtr) aeosp->the_struct;

       /* The user data registered with the explore option. */
       tally = (MydataPtr) aeosp->data;

       tally->counter++;
   }
}

The AsnExpOptStruct, aeosp, is not the same as an AsnOption, described earlier. The aeosp->dvp is the DataValPtr which would normally be written out on the AsnWrite(). For primitive types it contains the integer, boolean, real number, CharPtr or ByteStorePtr for the data. For structures like SEQUENCE, SET, etc, it contains the value START_STRUCT or END_STRUCT, and the pointer to the C structure will be in aeosp->the_struct, as above. When the same callback is used for different data types, the data type can be found in aeosp->atp for all types. When writing a CHOICE, a key for the CHOICE is found in aeosp->the_choice, and a value appropriate to the CHOICE is found in aeosp->dvp.  What is delivered for a CHOICE type can be problematic, since for a CHOICE itself, nothing but a type is normally written, so it is a judgment call what to supply in dvp. For these types, one should look at the object loader .c file to be certain what will be passed.

Note that for this iterator to work for structures and choices, AsnOpenStruct(), AsnCloseStruct(), and AsnWriteChoice() must be used in the object loaders.

When the stream is closed, the ExpOpt structures are also freed. If a stream is to be reused then an AsnExpOptFree() function is provided to strip ExpOpts off the stream pointer.

The generalized iterator shown here can be used to treat the object loader structures as a random access database with named keys in memory. It is extremely powerful and flexible. Its main drawback is that it must traverse the whole structure to find the fields of interest. Since this is normally very fast anyway, this is not a major problem at the moment, although for very large objects it may be.

AsnLib and Object Loaders Provide a Generalized Copy and Compare

Any data of arbitrary complexity can be easily copied or compared using the object loaders. Basically the object loader read and write functions, and a pointer to the object to be copied are passed to a function. The functions are then used first to write the struct as ASN.1, to a file or in memory, and then are used to read it back into a new structure, and then return a pointer to the new structure. The compare is done the same way, except one copy is written, then the other is written and, as part of the second write, compared to the first write (only one copy ever actually exists as an ASN.1 stream). This is a byte by byte compare, so the objects must be completely identical to return TRUE.

extern Pointer AsnIoCopy PROTO((Pointer from, AsnReadFunc readfunc,

                                                       AsnWriteFunc writefunc);

 

extern Pointer AsnIoMemCopy PROTO((Pointer from, AsnReadFunc readfunc,

                                                       AsnWriteFunc writefunc));

 

extern Boolean AsnIoMemComp PROTO((Pointer a, Pointer b,

                                                        AsnWriteFunc writefunc));

 

AsnLib Interface: asn.h

/* asn.h

* ===========================================================================

*

*                            PUBLIC DOMAIN NOTICE                         

*               National Center for Biotechnology Information

*                                                                         

*  This software/database is a "United States Government Work" under the  

*  terms of the United States Copyright Act.  It was written as part of   

*  the author's official duties as a United States Government employee and

*  thus cannot be copyrighted.  This software/database is freely available

*  to the public for use. The National Library of Medicine and the U.S.   

*  Government have not placed any restriction on its use or reproduction. 

*                                                                         

*  Although all reasonable efforts have been taken to ensure the accuracy 

*  and reliability of the software and data, the NLM and the U.S.         

*  Government do not and cannot warrant the performance or results that   

*  may be obtained by using this software or data. The NLM and the U.S.   

*  Government disclaim all warranties, express or implied, including      

*  warranties of performance, merchantability or fitness for any particular

*  purpose.                                                               

*                                                                         

*  Please cite the author in any work or product based on this material.  

*

* ===========================================================================

*

* File Name: asn.h

*

* Author:  James Ostell

*

* Version Creation Date: 1/1/91

*

* $Revision: 1.2 $

*

* File Description:

*   This header is the interface to all the routines in the ASN.1 libraries

*     that an application should ever use.  It also includes the necessary

*     typedefs -- however the application programmer is not meant to use

*     the internal structures directly outside of the specified functions,

*     as the internal structures may be changed without notice.

*

* Modifications: 

* --------------------------------------------------------------------------

* Date     Name        Description of modification

* -------  ----------  -----------------------------------------------------

*

*

* ==========================================================================

*/

 

#ifndef _ASNTOOL_

#define _ASNTOOL_

                      /*** depends on NCBI core routines ***/

#ifndef _NCBI_

#include <ncbi.h>

#endif

 

#ifdef __cplusplus

extern "C" {

#endif

 

   /**** ValNode is used for internal representation of values from

   ****  CHOICE, SET OF, SEQ OF and combinations for many cases.

   ****  it is provided in ncbimisc for build object routines ****/

 

/***  The following defines can be used for backward compatibility

#define AsnValue DataVal

#define AsnNode ValNode

***/

/***  In addition, AsnValueNode was changed to AsnValxNode so it would

      not conflict with the AsnValue define above

****/

 

#ifndef START_STRUCT

#define START_STRUCT       411           /* { found */

#define END_STRUCT         412           /* } found */

#endif

 

/* One node in a linked chain of values.  AsnType.defaultvalue points at such
   a chain (default value, range, subtypes).  Formerly named AsnValueNode. */

typedef struct asnvaluenode {

   Int2 valueisa;          /* kind of value held -- presumably selects which slot below is meaningful; TODO confirm */

   CharPtr name;           /* use for strings and named int */

   Int4 intvalue;             /* use for int and boolean */

   FloatHi realvalue;      /* presumably used for REAL values -- TODO confirm */

   struct asnvaluenode PNTR next;   /* next node in the chain */

}  AsnValxNode, PNTR AsnValxNodePtr;

 

   /******** AsnType is a node in the AsnTool parse tree *******/

 

/* A node in the AsnTool parse tree.  Application code is not meant to touch
   these fields directly; the layout may change without notice. */

typedef struct asntype {

   Int2 isa;               /* type code of this node -- NOTE(review): exact code space not shown here; confirm */

   CharPtr name;           /* name of this node */

   Uint1 tagclass;         /* presumably the BER tag class for this type -- TODO confirm */

   Int2 tagnumber;         /* presumably the BER tag number for this type -- TODO confirm */

   Boolean implicit;       /* presumably TRUE for IMPLICIT tagging -- TODO confirm */

   Boolean optional;       /* presumably TRUE for an OPTIONAL element -- TODO confirm */

   Boolean hasdefault;     /* presumably TRUE when defaultvalue holds a DEFAULT -- TODO confirm */

   Boolean exported;       /* presumably TRUE if listed in the module's EXPORTS -- TODO confirm */

   Boolean imported;       /* presumably TRUE if brought in via IMPORTS -- TODO confirm */

   Boolean resolved;       /* NOTE(review): looks like a "type reference resolved" flag -- confirm */

   AsnValxNodePtr defaultvalue;          /* used for default value, range, subtypes */

   struct asntype PNTR type;             /* link to another AsnType node -- presumably the type this node is defined as; confirm */

   Pointer branch;                       /* used for named ints, enum, set, sequence */

   Int2 tmp;     /* for temporary ->type link to local tree */

   struct asntype PNTR next;             /* next node in the chain */

}  AsnType, PNTR AsnTypePtr;

 

/* One ASN.1 module: its name, the types and values it holds, and a link
   to the next module in the chain. */

typedef struct asnmodule {

   CharPtr modulename;         /* name of the module */

   CharPtr filename;           /* if module to be loaded from disk */

   AsnTypePtr types;           /* AsnType nodes for this module's types -- presumably the head of a ->next chain; confirm */

   AsnTypePtr values;          /* AsnType nodes for this module's values -- presumably the head of a ->next chain; confirm */

   struct asnmodule PNTR next;    /* for chain of modules */

   Int2 lasttype;          /* for isa defined types */

   Int2 lastvalue;         /* for isa defined values */

}  AsnModule, PNTR AsnModulePtr;

 

/* Default size of AsnIo.buf. */

#define ASNIO_BUFSIZE      1024

/* Individual flag bits that are OR'ed together in AsnIo.type:
     bit 0 = text, bit 1 = binary, bit 2 = input, bit 3 = output,
     bit 4 = file, bit 5 = carrier. */

#define ASNIO_TEXT      0x01

#define ASNIO_BIN       0x02

#define ASNIO_IN        0x04

#define ASNIO_OUT       0x08

#define ASNIO_FILE      0x10

#define ASNIO_CARRIER   0x20   /* is a pure iterator */

 

/* Ready-made AsnIo.type combinations, built from the flag bits above. */

#define ASNIO_TEXT_IN      (ASNIO_FILE | ASNIO_IN  | ASNIO_TEXT)   /* == 21 */

#define ASNIO_TEXT_OUT     (ASNIO_FILE | ASNIO_OUT | ASNIO_TEXT)   /* == 25 */

#define ASNIO_BIN_IN       (ASNIO_FILE | ASNIO_IN  | ASNIO_BIN)    /* == 22 */

#define ASNIO_BIN_OUT      (ASNIO_FILE | ASNIO_OUT | ASNIO_BIN)    /* == 26 */

 

/* One level of the parsing stack (see AsnIo.typestack). */

typedef struct pstack {

    AsnTypePtr type;           /* type at this level of stack */

    Int4 len;                  /* length of item for binary decode */

    Boolean resolved;          /* resolution of type for binary decode */

   Boolean tag_indef;                /* indefinite tag length on input? */

} Pstack, PNTR PstackPtr;

 

/* Callback stored in AsnOption.freefunc; it is the function to free
   data.ptrvalue. */

typedef void (* AsnOptFreeFunc) PROTO ((Pointer));

 

/* One entry in a chain of options (the chain head is kept in AsnIo.aop). */

typedef struct asnopt {

   Int2 ao_class;               /* class of option. all negative numbers reserved (see the OP_ defines) */

   Int2 type;                /* type within ao_class */

   DataVal data;            /* data used for setting option */

   AsnOptFreeFunc freefunc;  /* function to free data.ptrvalue */

   struct asnopt PNTR next;  /* next option in the chain */

} AsnOption, PNTR AsnOptionPtr;

 

/* Argument block handed to an AsnExpOptFunc callback.
   NOTE(review): field meanings below are inferred from names and from the
   AsnOpenStruct / AsnWriteChoice / AsnCheckExpOpt parameter lists -- confirm. */

typedef struct asnexpoptstruct {

   struct asnio PNTR aip;      /* the AsnIo stream involved */

   AsnTypePtr atp;             /* the AsnType involved */

   DataValPtr dvp;             /* presumably the value being processed; confirm */

   Int2 the_choice;            /* NO_CHOICE_SET when no choice applies */

   Pointer the_struct;         /* presumably the structure passed to AsnOpenStruct/AsnCloseStruct; confirm */

   Pointer data;               /* presumably the user data registered with the callback; confirm */

} AsnExpOptStruct, PNTR AsnExpOptStructPtr;

 

/* Signature of the user callback stored in AsnExpOpt.user_callback. */

typedef void (* AsnExpOptFunc) PROTO ((AsnExpOptStructPtr));

#define NO_CHOICE_SET INT2_MIN     /* for AsnExpOptStruct.the_choice  */

 

/* One entry in the exploration options chain (the chain is kept in
   AsnIo.aeop): a set of types to check plus a user callback and its data. */

typedef struct expopt {

   Int2 numtypes;                     /* presumably the number of entries in types[]; TODO confirm */

   AsnTypePtr PNTR types;             /* the type to check */

   Pointer user_data;           /* user supplied data */

   AsnExpOptFunc user_callback; /* user supplied callback function */

   struct expopt PNTR next;     /* next entry in the chain */

} AsnExpOpt, PNTR AsnExpOptPtr;

 

/* User error-return callback (stored in AsnIo.error_ret, installed with
   AsnIoSetErrorMsg).  NOTE(review): arguments look like an error code and a
   message string -- confirm. */

typedef void ( *ErrorRetType) PROTO((Int2, CharPtr));

/* Read/write callback for non-FILE I/O (stored in AsnIo.readfunc/writefunc).
   AsnIoMemRead, AsnIoMemWrite, AsnIoBSRead and AsnIoBSWrite have this
   signature.  NOTE(review): arguments are presumably the iostruct, a data
   buffer and a byte count -- confirm. */

typedef Int2 ( *IoFuncType) PROTO((Pointer, CharPtr, Uint2));

 

/* State of one ASN.1 I/O stream: text or binary, input or output, on a FILE
   or through user-supplied read/write functions.  Application code is not
   meant to use these fields directly; the layout may change without notice. */

typedef struct asnio {

   CharPtr linebuf;      /* line buffer; linepos and word refer into it */

   Int1 type;            /* type- text-in, text-out, bin-in, bin-out (ASNIO_ flag bits) */

   Int2 linepos;         /* current offset in linebuf */

   FILE * fp;             /* file to write or read to */

   BytePtr buf;          /* buffer for I/O */

    Int2 bufsize;         /* sizeof this buffer */

   Int2 bytes,           /* bytes of data available in buf */

       offset;           /* current offset of processing in buf */

   Uint1 tagclass;       /* last BER tag-id-len read */

   Int2 tagnumber;       /* presumably the tag number from that same read; confirm */

   Boolean constructed;  /* presumably the constructed bit from that same read; confirm */

   Int4 length;          /* length of BER encoded data */

   Boolean tagsaved;     /* TRUE if tag info already here - stops read */

   Int4 used;            /* if tagsaved, bytes used recorded here */

   Int1 tabsize,         /* spaces per tab */

       indent_level,     /* current indent level for print output */

       linelength,      /* max line length on output */

       max_indent,       /* current maximum indent levels for first */

       state;            /* parsing state */

    BoolPtr first;        /* for first element on indented line for printing */

   Int4 linenumber;      /* for reporting errors */

   CharPtr word;           /* current word in linebuf */

   Int2 wordlen,         /* length of word in linebuf */

        token;           /* current parsing token for word */

    PstackPtr typestack;  /* the parsing stack for input and output */

   Int1 type_indent,     /* used like indent_level and max_indent, but for */

       max_type;         /* typestack */

   ErrorRetType error_ret;     /* user error return */

    Pointer iostruct;    /* non-FILE io structure */

    IoFuncType readfunc,      /* read/write functions for sockets */

          writefunc;     /*  open and close MUST be done outside AsnIo */

   Boolean read_id;     /* for checking AsnReadId AsnReadVal alternation */

   CharPtr fname;       /* name of file in use */

   AsnOptionPtr aop;    /* head of options chain */

   AsnExpOptPtr aeop;   /* exploration options chain */

   AsnExpOptStructPtr aeosp;   /* NOTE(review): presumably the argument block passed to exploration callbacks; confirm */

   Boolean io_failure;  /* set on failed write */

} AsnIo, PNTR AsnIoPtr;

 

/* Couples an AsnIo stream with a caller-visible memory buffer; handled by the
   AsnIoMem functions declared later in this header. */

typedef struct asniomem {    /* for AsnIo to and from a memory block */

   AsnIoPtr aip;                  /* the AsnIoPtr for this */

   BytePtr buf;                   /* a buffer for the data */

   Uint2 size,             /* size of this buffer (w) or bytes_to_read (r) */

          count;           /* count of bytes read from or written to buffer */

} AsnIoMem, PNTR AsnIoMemPtr;

 

/* Couples an AsnIo stream with a ByteStore; handled by the AsnIoBS functions
   declared later in this header. */

typedef struct asniobs {    /* for AsnIo to and from a memory ByteStore */

   AsnIoPtr aip;                  /* the AsnIoPtr for this */

   ByteStorePtr bsp;        /* byte store for this */

} AsnIoBS, PNTR AsnIoBSPtr;

 

/***** typedefs used often in object loaders **********/

 

/* Generic signature of an object loader's read function: takes the stream and
   the AsnType, returns the object as a generic Pointer.  Accepted by AsnIoCopy
   and AsnIoMemCopy. */

typedef Pointer (* AsnReadFunc) PROTO((AsnIoPtr aip, AsnTypePtr atp));

/* Generic signature of an object loader's write function: takes the object,
   the stream and the AsnType, returns a Boolean.  Accepted by AsnIoCopy,
   AsnIoMemCopy and AsnIoMemComp. */

typedef Boolean (* AsnWriteFunc) PROTO((Pointer object, AsnIoPtr aip, AsnTypePtr atp));

 

/*****************************************************************************

*

*   prototypes

*     

*****************************************************************************/

/*** asngen.c ****/

 

extern AsnTypePtr AsnReadId PROTO((AsnIoPtr aip, AsnModulePtr amp, AsnTypePtr atp));

extern Int2 AsnReadVal PROTO((AsnIoPtr aip, AsnTypePtr atp, DataValPtr vp));

extern Boolean AsnWrite PROTO((AsnIoPtr aip, AsnTypePtr atp, DataValPtr dvp));

extern Boolean AsnSkipValue PROTO((AsnIoPtr aip, AsnTypePtr atp));

 

extern Boolean AsnOpenStruct PROTO((AsnIoPtr aip, AsnTypePtr atp,

          Pointer the_struct));

extern Boolean AsnCloseStruct PROTO((AsnIoPtr aip, AsnTypePtr atp,

          Pointer the_struct));

extern Boolean AsnWriteChoice PROTO((AsnIoPtr aip, AsnTypePtr atp, Int2 choice,

          DataValPtr the_value));

extern void AsnCheckExpOpt PROTO((AsnIoPtr aip, AsnTypePtr atp, DataValPtr dvp));

extern AsnExpOptPtr AsnExpOptNew PROTO((AsnIoPtr aip, CharPtr path,

          Pointer user_data, AsnExpOptFunc user_func));

extern AsnExpOptPtr AsnExpOptFree PROTO((AsnIoPtr aip, AsnExpOptPtr aeop));

 

extern Int2 AsnGetLevel PROTO((AsnIoPtr aip));

extern void AsnNullValueMsg PROTO((AsnIoPtr aip, AsnTypePtr node));

 

/*** asntypes.c ***/

 

extern void AsnKillValue PROTO((AsnTypePtr atp, DataValPtr dvp));

extern AsnTypePtr PNTR AsnTypePathFind PROTO((AsnModulePtr amp, CharPtr str, Int2Ptr numtypes));

extern AsnTypePtr AsnTypeFind PROTO((AsnModulePtr amp, CharPtr str));

#define AsnFind(x) AsnTypeFind(NULL,x)    /* find type (all) */

extern CharPtr AsnFindPrimName PROTO((AsnTypePtr atp));

extern CharPtr AsnFindBaseName PROTO((AsnTypePtr atp));

extern AsnTypePtr AsnLinkType PROTO((AsnTypePtr type, AsnTypePtr localtype));

extern void AsnUnlinkType PROTO((AsnTypePtr type));

extern CharPtr AsnTypeDumpStack PROTO((CharPtr str, AsnIoPtr aip));

extern Boolean AsnTreeLoad PROTO((char * file, AsnValxNodePtr * avnptr, AsnTypePtr * atptr, AsnModulePtr * ampptr));

#define AsnLoad() AsnTreeLoad(asnfilename, &avn, &at, &amp)   /* simple loader */

extern void AsnModuleLink PROTO((AsnModulePtr amp));

extern CharPtr AsnEnumStr PROTO((CharPtr str, Int2 val));

extern CharPtr AsnEnumTypeStr PROTO((AsnTypePtr atp, Int2 val));

extern AsnModulePtr AsnAllModPtr PROTO((void));

 

/*** asnio.c ****/

 

extern AsnIoPtr AsnIoOpen PROTO((CharPtr file_name, CharPtr mode));

extern AsnIoPtr AsnIoClose PROTO((AsnIoPtr aip));

extern void AsnIoReset PROTO((AsnIoPtr aip));

extern void AsnIoSetErrorMsg PROTO((AsnIoPtr aip, ErrorRetType error_ret));

extern Int4 AsnIoSeek PROTO((AsnIoPtr aip, Int4 pos));

extern Int4 AsnIoTell PROTO((AsnIoPtr aip));

extern void AsnIoFlush PROTO((AsnIoPtr aip));

extern AsnIoPtr AsnIoNew PROTO((Int1 type, FILE * fp, Pointer iostruct, IoFuncType readfunc, IoFuncType writefunc));

extern Boolean AsnIoSetBufsize PROTO((AsnIoPtr aip, Int2 size));

extern AsnOptionPtr AsnIoOptionNew PROTO((AsnIoPtr aip, Int2 ao_class, Int2 type, DataVal av, AsnOptFreeFunc freefunc));

extern void AsnIoOptionFree PROTO((AsnIoPtr aip, Int2 ao_class, Int2 type));

extern Boolean AsnClassTypeMatch PROTO((Int2 ao_class, Int2 type, Int2 this_class, Int2 this_type));

extern AsnOptionPtr AsnIoOptionGet PROTO((AsnIoPtr aip, Int2 ao_class, Int2 type,

                                  AsnOptionPtr last));

extern AsnOptionPtr AsnOptionNew PROTO((AsnOptionPtr PNTR aopp, Int2 ao_class, Int2 type, DataVal av, AsnOptFreeFunc freefunc));

extern void AsnOptionFree PROTO((AsnOptionPtr PNTR aopp, Int2 ao_class, Int2 type));

extern AsnOptionPtr AsnOptionGet PROTO((AsnOptionPtr head, Int2 ao_class, Int2 type,

                                  AsnOptionPtr last));

 

   /*** read and write to memory buffer ***/

extern AsnIoMemPtr AsnIoMemOpen PROTO((CharPtr mode, BytePtr buf, Uint2 size));

extern AsnIoMemPtr AsnIoMemClose PROTO((AsnIoMemPtr aimp));

extern Boolean AsnIoMemReset PROTO((AsnIoMemPtr aimp, Uint2 bytes_to_read));

extern Int2 AsnIoMemRead PROTO((Pointer, CharPtr, Uint2));

extern Int2 AsnIoMemWrite PROTO((Pointer, CharPtr, Uint2));

 

   /*** read and write to a ByteStore in memory ***/

extern AsnIoBSPtr AsnIoBSOpen PROTO((CharPtr mode, ByteStorePtr bsp));

extern AsnIoBSPtr AsnIoBSClose PROTO((AsnIoBSPtr aibp));

extern Int2 AsnIoBSRead PROTO((Pointer, CharPtr, Uint2));

extern Int2 AsnIoBSWrite PROTO((Pointer, CharPtr, Uint2));

 

  /** Copy and Compare functions ***/

extern Pointer AsnIoCopy PROTO((Pointer from, AsnReadFunc readfunc, AsnWriteFunc writefunc));

extern Pointer AsnIoMemCopy PROTO((Pointer from, AsnReadFunc readfunc, AsnWriteFunc writefunc));

extern Boolean AsnIoMemComp PROTO((Pointer a, Pointer b, AsnWriteFunc writefunc));

 

#define AsnIoNullOpen() AsnIoNew((ASNIO_OUT | ASNIO_TEXT | ASNIO_CARRIER), NULL, NULL, NULL, NULL)

 

/*** asndebin.c ***/

 

extern AsnTypePtr AsnBinReadId PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Int2 AsnBinReadVal PROTO((AsnIoPtr aip, AsnTypePtr atp, DataValPtr vp));

 

/*** asnenbin.c ***/

 

extern Boolean AsnBinWrite PROTO((AsnIoPtr aip, AsnTypePtr atp, DataValPtr dvp));

         /** expert use only ***/

extern void AsnEnBinBytes PROTO((Pointer ptr, Uint4 len, AsnIoPtr aip));

 

/*** asnlex.c ***/

 

extern AsnTypePtr AsnTxtReadId PROTO((AsnIoPtr aip, AsnModulePtr amp, AsnTypePtr atp));

extern Int2 AsnTxtReadVal PROTO((AsnIoPtr aip, AsnTypePtr atp, DataValPtr vp));

 

/*** asnprint.c ***/

 

extern Boolean AsnTxtWrite PROTO((AsnIoPtr aip, AsnTypePtr atp, DataValPtr dvp));

 

/*** asnlext.c ***/

 

extern AsnModulePtr AsnLoadModules PROTO((AsnIoPtr aip));

 

/******** temporary defines for older code *************/

 

#define AsnStartStruct(x,y) AsnOpenStruct(x, y, NULL)

#define AsnEndStruct(x,y) AsnCloseStruct(x, y, NULL)

 

/***** AsnOption ao_class values - do not reuse ***************/

/***** all positive numbers > 0 are available to non-NCBI applications ***/

 

#define OP_ANY          0

#define OP_TOGENBNK    -1

#define OP_BB2ASN      -2

#define OP_NCBIOBJSSET -3

#define OP_NCBIOBJSEQ  -4

#define OP_GET_MUID    -5

 

 

#ifdef __cplusplus

}

#endif

 

#endif



General Use Objects


Introduction
Large Text Blocks: StringStore
The Date
Identifying Things: Object-id
Identifying Things: Dbtag
Identifying People: Person-id
Expressing Uncertainty with Fuzzy Integers: Int-fuzz
Creating Your Own Objects: User-object
ASN.1 Specification: general.asn
C Structures and Functions: objgen.h


 Introduction

This section presents the data objects defined in general.asn and objgen.[ch].  They are a miscellaneous collection of generally useful types.

Large Text Blocks: StringStore

A StringStore is defined as a VisibleString for ASN.1 encoding.  This type is used to hold very long strings.  It is simply a hint to the AsnLib functions to store the incoming data in a ByteStore (see CoreLib chapter) rather than an array to avoid overrunning allocation limits of some computers.  OCTET STRINGs (a sequence of opaque bytes) are always kept in ByteStore structures since the length of the object must be stored as well (no terminating '\0' is possible).  ByteStores have the advantage of segmenting the long strings, which for nucleic acid data can get very long.  The ByteStore will allow us to add data buffering to disk for these large objects as it becomes necessary even on large computers.

The Date

ASN.1 has primitive types for recording dates, but they require the time in seconds as well.  For scientific and bibliographic data, it is common that only the date, or even just a part of the date (e.g. month and year) is available.  Rather than use artificial zero values for the more precise ASN.1 form, we have created a specialized Date type.  Date is a CHOICE of a simple, unparsed string or a structured Date-std.  The string form is a fall-back for when the input data is so poorly structured that it is impossible to reliably parse the date fields from it.  It should only be used as a last resort to accommodate old data, as it is impossible to compute or index on.

When possible, the "std" form of the Date should be used.  In this case year is an integer (e.g. 1992), month is an integer from 1-12 (where January is 1), and day is an integer from 1-31.  A string called "season" can be used, particularly for bibliographic citations (e.g. the "spring" issue).  When a range of months is given for an issue (e.g. "June‑July") it cannot be represented directly.  However, one would like to be able to index on integer months but still not lose the range.  This is accomplished by putting 6 in the "month" slot and "‑July" in the "season" slot.  The DatePrint() function will put them back together for display, but the issue can still be indexed by month.  Year is the only required field in a Date-std.

The "C" structure used for Date can accommodate both the representation of the CHOICE itself (which kind of Date is this?) and the data from either CHOICE.  It has a four byte array and a CharPtr.  The byte[0] indicates what kind of Date it is.  If a "str" type, then the CharPtr points to the string and the other three bytes in the array have no meaning.  If a "std" type, then the byte[1] is the year (minus 1900 to save space - the object loaders will add the 1900 back when encoding into ASN.1), byte[2] is the month (or 0 if not given), and byte[3] is the day (or 0 if not given).  If the CharPtr is NULL, then the season is not given.

The object loaders contain a number of handy functions for working with Dates in addition to the usual New(), Free(), AsnRead() and AsnWrite() functions. DateWrite() will fill a Date.std with the function arguments. DateRead() will fill pointer arguments with the values from a Date. DateCurr() will create and return a Date.std filled with the current date by accessing the computer system.  DateDup() will create a copy of a Date. DatePrint() will format a Date for display, writing the result into a buffer supplied by the caller.  This buffer should normally be at least 30 bytes long.  The format is e.g. "Jun 30, 1992".

DateMatch(a, b, all) will return 0 if Date a is the same as Date b, 1 if b is after a, -1 if b is before a. It will return a 2 or -2 (for sorting) if they are different Date types (str and std) that could not be compared. If all is equal to TRUE, then all fields that are set in one Date must be set and must match in the other Date. If all is equal to FALSE, then only the fields set in both are matched. Note that this function can only measure if one date is before another chronologically if both are Date-std types. The string Date types can only be compared lexically (like strcmp()).

Identifying Things: Object-id

An Object-id is a simple structure used to identify a data object.  It is just a CHOICE of an INTEGER or a VisibleString.  It must always be used within some defining context (e.g. see Dbtag below) in order to have some global meaning.  It allows flexibility in a host system's preference for identifying things by integers or strings.

The ObjectId "C" structure has a 4 byte integer slot and a CharPtr slot.  If the CharPtr is NULL, then the integer value is the identifier and the type is "int".  If the CharPtr is not NULL, then the Object-id is type "str" and the CharPtr is considered to point at the identifier.

There is an ObjectIdDup() function to make a copy of an ObjectId and an ObjectIdMatch() function which returns TRUE if two ObjectIds are identical, FALSE if they are not.

Identifying Things: Dbtag

A Dbtag is an Object-id within the context of a database.  The database is just defined by a VisibleString.  The strings identifying the database are not centrally controlled, so it is possible that a conflict could occur.  If there is a proliferation of Dbtags, then a registry might be considered at NCBI.  Dbtags provide a simple, general way for small database providers to supply their own internal identifiers in a way which will, usually, be globally unique as well, yet requires no official sanction.  So, for example, identifiers for features on sequences are not widely available at the present time.  However, the Eukaryotic Promoter Database (EPD) can be provided as a set of features on sequences.  The internal key to each EPD entry can be propagated as the Feature-id by using a Dbtag where "EPD" is the "db" field and an integer is used in the Object-id, which is the same integer identifying the entry in the normal EPD release.

As for ObjectIds, there are DbtagMatch() and DbtagDup() functions in the object loaders.

Identifying People: Person-id

Person-id provides an extremely flexible way to identify people.  There are four CHOICES from very explicit to completely unstructured.  When one is building a database, one should select the most structured form possible.  However, when one is processing data from other sources, one can only pick the most structured form possible, given the input data.

The first Person-id CHOICE is a Dbtag.  It would allow people to be identified by some formal registry.  For example, in the USA, it might be possible to identify people by Social Security Number.  Theoretically, one could then maintain a link to a person in a database, even if they changed their name.  Dbtag would allow other registries, such as professional societies, to be used as well. Frankly, this may be wishful thinking and possibly even socially inadvisable, though from a database standpoint, it would be very useful to have some stable identifier for people.

A Name-std CHOICE is the next most explicit form.  It allows a structured, fielded name, which makes it possible to index by last name and to disambiguate (say, "Jones") by first name.  This is the best choice when the data is available and its use should be encouraged by those building new databases wherever reasonable.

The last two choices are string types.  MEDLINE stores names in strings in a structured way (e.g. Jones JM).  This means one can usually, but not always, parse out last names and can generally build indexes on the assumption that the last name is first.  Thus, it is worth distinguishing this case from the pure string form, the last CHOICE.  In a pure string, there are no guarantees of any kind made about the structure of the name.  It could be last name first, first name first, comma after last name, periods between initials, etc.  The string form should be the CHOICE of last resort.

In the "C" structure, the first element indicates the type of the Person-id.  The generic Pointer then must be cast to the correct type given that knowledge.  So, for a Person-id.dbtag the Pointer is a DbtagPtr.  For Person-id.name it is a NameStdPtr.  For Person-id.ml or Person-id.str it is a CharPtr.

Expressing Uncertainty with Fuzzy Integers: Int-fuzz

Lengths of biological sequences and locations on them are expressed with integers.  However, sometimes it is desirable to be able to indicate some uncertainty about that length or location.  Unfortunately, most software cannot make good use of such uncertainties, though in most cases this is fine.  In order to provide both a simple, single integer view, as well as a more complex fuzzy view when appropriate, we have adopted the following strategy.  In the NCBI specifications, all lengths and locations are always given by simple integers.  If information about fuzziness is appropriate, then an Int-fuzz is ADDED to the data.  In this case, the simple integer can be considered a "best guess" of the length or location.  Thus simple software can ignore fuzziness, while it is not lost to more sophisticated uses.

Fuzziness can take a variety of forms.  It can be plus or minus some fixed value.  It can be somewhere in a range of values.  It can be plus or minus a percentage of the best guess value.  It may also be certain boundary conditions (greater than the value, less than the value) or refer to the bond BETWEEN residues of the biological sequence (bond to the right of this residue, bond to the left of that residue).

Creating Your Own Objects: User-object

One of the strengths of ASN.1 is that it requires a formal specification of data down to very detailed levels.  This enforces clear definitions of data which greatly facilitates exchange of information in useful ways between different databases, software tools, and scientific enterprises.  The problem with this approach is that it makes it very difficult for end users to add their own objects to the specification or enhance objects already in the specification.  Certainly custom modules can be added to accommodate specific groups' needs, but the data from such custom modules cannot be exchanged or passed through tools which adhere only to the common specification.

We have defined an object called a User-object, which can represent any class of simple, structured, or tabular data in a completely structured way, but which can be defined in any way that meets a user's needs. The User-object itself has a "class" tag which is a string used like the "db" string in Dbtag, to set the context in which this User-object is meaningful.  The "class" strings are not centrally controlled, so again it is possible to have a conflict, but unlikely unless activity in this area becomes very great.  Within a "class" one can define an object "type" by either a string or an integer.  Thus any particular endeavor can define a wide variety of different types for their own use.  The combination of "class" and "type" identifies the object to databases and software that may understand and make use of this particular User-object's structure and properties.  Yet, the generic definition means software that does not understand the purpose or use of any User-object can still parse it, pass it through, or even print it out for a user to peruse.

The attributes of the User-object are contained in one or more User-fields.  Each User-field has a field label, which is either a string or an integer.  It may contain any kind of data, strings, real numbers, integers, arrays of anything, or even sub-fields or complete sub-objects.  When arrays and repeating fields are supplied, the optional "num" attribute of the User-field is used to tell software how many elements to prepare to receive.  Virtually any structured data type from the simplest to the most complex can be built up from these elements.

The User-object is provided in a number of places in the public ASN.1 specifications to allow users to add their own structured features to Feature-tables or their own custom extensions to existing features.  This allows new ideas to be tried out publicly, and allows software tools to be written to accommodate them, without requiring consensus among scientists or constant revisions to specifications.  Those new ideas which time and experience indicate have become important concepts in molecular biology can be "graduated" to real ASN.1 specifications in the public scheme.  A large body of structured data would presumably already exist in User-objects of this type, and these could all be back fitted into the new specified type, allowing data to "catch up" to the present specification.  Those User-objects which do not turn out to be generally useful or important remain as harmless historical artifacts.  User-objects could also be used for custom software to attach data only required for use by a particular tool to an existing standard object without harming it for use by standard tools.

ASN.1 Specification: general.asn

--$Revision: 1.2 $

--**********************************************************************

--

--  NCBI General Data elements

--  by James Ostell, 1990

--

--**********************************************************************

 

NCBI-General DEFINITIONS ::=

BEGIN

 

EXPORTS Date, Person-id, Object-id, Dbtag, Int-fuzz, User-object;

 

-- StringStore is really a VisibleString.  It is used to define very

--   long strings which may need to be stored by the receiving program

--   in special structures, such as a ByteStore, but it's just a hint.

--   AsnTool stores StringStores in ByteStore structures.

-- OCTET STRINGs are also stored in ByteStores by AsnTool

--

-- typedef struct bsunit {       /* for building multiline strings */

   -- Nlm_Handle str;            /* the string piece */

   -- Nlm_Int2 len_avail,

       -- len;

   -- struct bsunit PNTR next; }       /* the next one */

-- Nlm_BSUnit, PNTR Nlm_BSUnitPtr;

--

-- typedef struct bytestore {

   -- Nlm_Int4 seekptr,       /* current position */

   -- totlen,             /* total stored data length in bytes */

-- chain_offset;     /* offset in ByteStore of first byte in curchain */

   -- Nlm_BSUnitPtr chain,       /* chain of elements */

      -- curchain;           /* the BSUnit containing seekptr */

-- } Nlm_ByteStore, PNTR Nlm_ByteStorePtr;

--

-- AsnTool incorporates this as a primitive type, so the definition

--   is here just for completeness

--

--  StringStore ::= [APPLICATION 1] IMPLICIT OCTET STRING

--

 

-- Date is used to replace the (overly complex) UTCTime, GeneralizedTime

--  of ASN.1

--  It stores only a date

--

 

Date ::= CHOICE {

    str VisibleString ,        -- for those unparsed dates

    std Date-std }             -- use this if you can

 

Date-std ::= SEQUENCE {        -- NOTE: this is NOT a unix tm struct

    year INTEGER ,             -- full year (including 1900)

    month INTEGER OPTIONAL ,   -- month (1-12)

    day INTEGER OPTIONAL ,     -- day of month (1-31)

    season VisibleString OPTIONAL }  -- for "spring", "may-june", etc

 

-- Dbtag is generalized for tagging

-- eg. { db "Social Security", tag str "023-79-8841" }

-- or  { db "member", tag id 8882224 }

 

Dbtag ::= SEQUENCE {

    db VisibleString ,          -- name of database or system

    tag Object-id }         -- appropriate tag

 

-- Object-id can tag or name anything

--

 

Object-id ::= CHOICE {

    id INTEGER ,

    str VisibleString }

 

-- Person-id is to define a std element for people

--

 

Person-id ::= CHOICE {

    dbtag Dbtag ,               -- any defined database tag

    name Name-std ,             -- structured name

    ml VisibleString ,          -- MEDLINE name (semi-structured)

                                --    eg. "Jones RM"

    str VisibleString }         -- unstructured name

 

Name-std ::= SEQUENCE { -- Structured names

    last VisibleString ,

    first VisibleString OPTIONAL ,

    middle VisibleString OPTIONAL ,

    full VisibleString OPTIONAL ,    -- full name eg. "J. John Poop, Esq"

    initials VisibleString OPTIONAL,  -- first + middle initials

    suffix VisibleString OPTIONAL ,   -- Jr, Sr, III

    title VisibleString OPTIONAL }    -- Dr., Sister, etc

 

--**** Int-fuzz **********************************************

--*

--*   uncertainties in integer values

 

Int-fuzz ::= CHOICE {

    p-m INTEGER ,                    -- plus or minus fixed amount

    range SEQUENCE {                 -- max to min

        max INTEGER ,

        min INTEGER } ,

    pct INTEGER ,                    -- % plus or minus (x10) 0-1000

    lim ENUMERATED {                 -- some limit value

        unk (0) ,                    -- unknown

        gt (1) ,                     -- greater than

        lt (2) ,                     -- less than

        tr (3) ,                     -- space to right of position

        tl (4) ,                     -- space to left of position

        other (255) } }              -- something else

 

 

--**** User-object **********************************************

--*

--*   a general object for a user defined structured data item

--*    used by Seq-feat and Seq-descr

 

User-object ::= SEQUENCE {

    class VisibleString OPTIONAL ,   -- endeavor which designed this object

    type Object-id ,                 -- type of object within class

    data SEQUENCE OF User-field }    -- the object itself

 

User-field ::= SEQUENCE {

    label Object-id ,                -- field label

    num INTEGER OPTIONAL ,           -- required for strs, ints, reals, oss

    data CHOICE {                    -- field contents

        str VisibleString ,

        int INTEGER ,

        real REAL ,

        bool BOOLEAN ,

        os OCTET STRING ,

        object User-object ,         -- for using other definitions

        strs SEQUENCE OF VisibleString ,

        ints SEQUENCE OF INTEGER ,

        reals SEQUENCE OF REAL ,

        oss SEQUENCE OF OCTET STRING ,

        fields SEQUENCE OF User-field ,

        objects SEQUENCE OF User-object } }

 

 

 

END

C Structures and Functions: objgen.h

/*  objgen.h

* ===========================================================================

*

*                            PUBLIC DOMAIN NOTICE                         

*               National Center for Biotechnology Information

*                                                                         

*  This software/database is a "United States Government Work" under the  

*  terms of the United States Copyright Act.  It was written as part of   

*  the author's official duties as a United States Government employee and

*  thus cannot be copyrighted.  This software/database is freely available

*  to the public for use. The National Library of Medicine and the U.S.   

*  Government have not placed any restriction on its use or reproduction. 

*                                                                          

*  Although all reasonable efforts have been taken to ensure the accuracy 

*  and reliability of the software and data, the NLM and the U.S.         

*  Government do not and cannot warrant the performance or results that   

*  may be obtained by using this software or data. The NLM and the U.S.   

*  Government disclaim all warranties, express or implied, including      

*  warranties of performance, merchantability or fitness for any particular

*  purpose.                                                                

*                                                                         

*  Please cite the author in any work or product based on this material.  

*

* ===========================================================================

*

* File Name:  objgen.h

*

* Author:  James Ostell

*  

* Version Creation Date: 1/1/91

*

* $Revision: 1.2 $

*

* File Description:  Object manager interface for module NCBI-General

*

* Modifications: 

* --------------------------------------------------------------------------

* Date    Name        Description of modification

* -------  ----------  -----------------------------------------------------

*

*

* ==========================================================================

*/

 

#ifndef _NCBI_General_

#define _NCBI_General_

 

#ifndef _ASNTOOL_

#include <asn.h>

#endif

 

#ifdef __cplusplus

extern "C" {

#endif

 

/*****************************************************************************

*

*   loader

*

*****************************************************************************/

/* Loader entry point for the NCBI-General module: takes no arguments and returns a Boolean status. */
/* NOTE(review): presumably TRUE means the module's ASN.1 definitions are ready for use - confirm in objgen.c */
extern Boolean GeneralAsnLoad PROTO((void));

 

/*****************************************************************************

*

*   internal structures for NCBI-General objects

*

*****************************************************************************/

 

/*****************************************************************************

*

*   Date, Date-std share the same structure

*      any data[2] or data[3] values = 0 means not set or not present

*   data [0] - CHOICE of date ,0=str, 1=std

*        [1] - year (- 1900)

*        [2] - month (1-12)  optional

*       [3] - day (1-31)    optional

*

*****************************************************************************/

 

 

/* In-memory form shared by the ASN.1 Date and Date-std types (string form or structured form). */
typedef struct date {

   Uint1 data[4];      /* [0] CHOICE: 0=str, 1=std; [1] year - 1900; [2] month 1-12 (0 = not set); [3] day 1-31 (0 = not set) */

   CharPtr str;            /* str or season or NULL */

} NCBI_Date, PNTR NCBI_DatePtr;

/* DatePtr is an alias: every use of DatePtr below expands to NCBI_DatePtr */
#define DatePtr NCBI_DatePtr

 

/* NOTE(review): the per-function notes below are inferred from names and signatures only - confirm against objgen.c */

NCBI_DatePtr DateNew PROTO((void));   /* presumably allocates an empty NCBI_Date */

NCBI_DatePtr DateFree PROTO((NCBI_DatePtr dp));   /* presumably releases dp; returns an NCBI_DatePtr */

Boolean DateWrite PROTO((NCBI_DatePtr dp, Int2 year, Int2 month, Int2 day, CharPtr season));   /* values are passed in by value, presumably stored into dp */

Boolean DateRead PROTO((NCBI_DatePtr dp, Int2Ptr year, Int2Ptr month, Int2Ptr day, CharPtr season));   /* year/month/day are pointers, presumably outputs; season buffer size is not passed */

Boolean DatePrint PROTO((NCBI_DatePtr dp, CharPtr buf));   /* no buffer length is passed, so buf must be sized by the caller */

NCBI_DatePtr DateCurr PROTO((void));   /* presumably returns a new date set to the current date */

NCBI_DatePtr DateDup PROTO((NCBI_DatePtr dp));   /* presumably returns a copy of dp */

Boolean DateAsnWrite PROTO((NCBI_DatePtr dp, AsnIoPtr aip, AsnTypePtr atp));   /* ASN.1 output of dp on stream aip as type atp */

NCBI_DatePtr DateAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));   /* ASN.1 input from stream aip as type atp */

Int2 DateMatch PROTO((DatePtr a, DatePtr b, Boolean all));   /* declared with the DatePtr alias; returns Int2 rather than Boolean - meaning of the result and of "all" to be confirmed */

 

/*****************************************************************************

*

*   Object-id stuff

*

*****************************************************************************/

/* Object-id: an identifier that can be carried as an integer and/or as a string. */
/* NOTE(review): presumably only one of id/str is meaningful at a time (non-NULL str selecting the string form) - confirm against general.asn */
typedef struct objid {

   Int4 id;                /* integer form of the identifier */

   CharPtr str;            /* string form of the identifier */

} ObjectId, PNTR ObjectIdPtr;

 

extern ObjectIdPtr ObjectIdNew PROTO((void));

extern ObjectIdPtr ObjectIdFree PROTO(( ObjectIdPtr oid));

extern ObjectIdPtr ObjectIdAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean ObjectIdAsnWrite PROTO((ObjectIdPtr oid, AsnIoPtr aip, AsnTypePtr atp));

/* Unlike DateMatch (Int2), this match function reports a Boolean; NOTE(review): presumably TRUE when a and b are the same id - confirm */
extern Boolean ObjectIdMatch PROTO((ObjectIdPtr a, ObjectIdPtr b));

extern ObjectIdPtr ObjectIdDup PROTO((ObjectIdPtr oldid));

 

/*****************************************************************************

*

*   DBtag stuff

*

*****************************************************************************/

/* Dbtag: pairs a database name with an Object-id inside that database. */
typedef struct dbtag {

   CharPtr db;             /* NOTE(review): presumably the name of the source database - confirm against general.asn */

   ObjectIdPtr tag;        /* identifier within db, held as an ObjectId (integer or string form) */

} Dbtag, PNTR DbtagPtr;

 

extern DbtagPtr DbtagNew PROTO((void));

extern DbtagPtr DbtagFree PROTO(( DbtagPtr dbt));

extern DbtagPtr DbtagAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean DbtagAsnWrite PROTO((DbtagPtr dbt, AsnIoPtr aip, AsnTypePtr atp));

/* Returns Boolean, like ObjectIdMatch; NOTE(review): presumably TRUE when db and tag both agree - confirm */
extern Boolean DbtagMatch PROTO((DbtagPtr a, DbtagPtr b));

extern DbtagPtr DbtagDup PROTO((DbtagPtr oldtag));

 

/*****************************************************************************

*

*   Name-std

*   names[0] = last

*        [1] = first

*        [2] = middle

*        [3] = full

*        [4] = initials

*        [5] = suffix

*        [6] = title

*

*****************************************************************************/

/* Name-std: a fielded personal name, one string slot per name component. */
typedef struct namestd {

   CharPtr names[7];       /* [0]=last, [1]=first, [2]=middle, [3]=full, [4]=initials, [5]=suffix, [6]=title */

} NameStd, PNTR NameStdPtr;

 

/* No Match or Dup function is declared for NameStd in this header, only New/Free and ASN.1 read/write */
extern NameStdPtr NameStdNew PROTO((void));

extern NameStdPtr NameStdFree PROTO(( NameStdPtr nsp));

extern NameStdPtr NameStdAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean NameStdAsnWrite PROTO((NameStdPtr nsp, AsnIoPtr aip, AsnTypePtr atp));

 

/*****************************************************************************

*

*   Person-id

*     choice = 0 = not set

*              1 = dbtag

*              2 = name

*              3 = ml

*              4 = str

*

*****************************************************************************/

/* Person-id: a tagged pointer; choice says which form of identification data points to. */
typedef struct personid {

   Uint1 choice;         /* which CHOICE: 0 = not set, 1 = dbtag, 2 = name, 3 = ml, 4 = str */

   Pointer data;         /* points to appropriate data structure; must be cast according to choice */
                         /* NOTE(review): presumably DbtagPtr for dbtag, NameStdPtr for name, CharPtr for ml and str - confirm */

} PersonId, PNTR PersonIdPtr;

 

extern PersonIdPtr PersonIdNew PROTO((void));

extern PersonIdPtr PersonIdFree PROTO(( PersonIdPtr pid));

extern PersonIdPtr PersonIdAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean PersonIdAsnWrite PROTO((PersonIdPtr pid, AsnIoPtr aip, AsnTypePtr atp));

 

/*****************************************************************************

*

*   Int-fuzz

*

*****************************************************************************/

/* Int-fuzz: one of four forms of uncertainty on an integer, selected by choice. */
typedef struct intfuzz {

   Uint1 choice;       /* which form: 1=p-m, 2=range, 3=pct, 4=lim */

   Int4 a, b;          /* a = the p-m, max, pct, or lim value for the selected choice; b = min (presumably only used with range) */

} IntFuzz, PNTR IntFuzzPtr;

 

extern IntFuzzPtr IntFuzzNew PROTO((void));

extern IntFuzzPtr IntFuzzFree PROTO(( IntFuzzPtr ifp));

extern IntFuzzPtr IntFuzzAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean IntFuzzAsnWrite PROTO((IntFuzzPtr ifp, AsnIoPtr aip, AsnTypePtr atp));

 

/*****************************************************************************

*

*   User-field

*      data is a DataVal where:

*    choice    asn1              data. =

        1 = str VisibleString ,  ptrvalue = CharPtr

        2 = int INTEGER ,        intvalue

        3 = real REAL ,          realvalue

        4 = bool BOOLEAN ,       boolvalue

        5 = os OCTET STRING ,    ptrvalue = ByteStorePtr

        6 = object User-object ,   ptrvalue = UserObjectPtr

        7 = strs SEQUENCE OF VisibleString ,  ptrvalue = CharPtr PNTR

        8 = ints SEQUENCE OF INTEGER ,        ptrvalue = Int4Ptr

        9 = reals SEQUENCE OF REAL ,          ptrvalue = FloatHiPtr

        10 = oss SEQUENCE OF OCTET STRING ,   ptrvalue = ByteStorePtr PNTR

        11 = fields SEQUENCE OF User-field ,  ptrvalue = UserFieldPtr

        12 = objects SEQUENCE OF User-object } }  ptrvalue = UserObjectPtr

 

*   User-object

*

*****************************************************************************/

/* User-field: one labelled value of a User-object; choice selects which member of data is valid. */
typedef struct userfield {

    ObjectIdPtr label;              /* label of this field, as an ObjectId */

    Int4 num;                       /* NOTE(review): presumably the element count for the SEQUENCE OF choices (7-12) - confirm */

    Uint1 choice;                   /* 1-12, per the choice table in the comment box preceding this struct */

    DataVal data;                   /* ptrvalue, intvalue, realvalue, or boolvalue, depending on choice */

    struct userfield PNTR next;     /* next field in a linked list (choice 11 stores a UserFieldPtr for SEQUENCE OF User-field) */

} UserField, PNTR UserFieldPtr;

 

extern UserFieldPtr UserFieldNew PROTO((void));

extern UserFieldPtr UserFieldFree PROTO(( UserFieldPtr ufp));

extern UserFieldPtr UserFieldAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean UserFieldAsnWrite PROTO((UserFieldPtr ufp, AsnIoPtr aip, AsnTypePtr atp));

 

/* User-object: a user-defined object made of a class string, a type id, and a list of User-fields. */
typedef struct userobj {

    CharPtr _class;                 /* NOTE(review): leading underscore presumably avoids the C++ keyword "class" (this header is wrapped for __cplusplus) */

    ObjectIdPtr type;               /* type of this object, as an ObjectId */

    UserFieldPtr data;              /* the fields of this object (UserField carries its own next pointer) */

    struct userobj PNTR next;   /* for SEQUENCE OF User-object */

} UserObject, PNTR UserObjectPtr;

 

extern UserObjectPtr UserObjectNew PROTO((void));

extern UserObjectPtr UserObjectFree PROTO(( UserObjectPtr uop));

extern UserObjectPtr UserObjectAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean UserObjectAsnWrite PROTO((UserObjectPtr uop, AsnIoPtr aip, AsnTypePtr atp));

 

#ifdef __cplusplus

}

#endif

 

#endif

 



Bibliographic References


Introduction
Citation Components: Affiliation
Citation Components: Authors
Citation Components: Imprint
Citation Components: Title
Citing an Article
Citing a Journal
Citing a Book
Citing a Proceedings
Citing a Letter, Manuscript, or Thesis
Citing Directly Submitted Data
Citing a Patent
Identifying a Patent
Citing an Article or Book which is In Press
Special Cases: Unpublished, Unparsed, or Unusual
Accommodating Any Publication Type
Grouping Different Forms of Citation for a Single Work
Sets of Citations
Comparing Citations
ASN.1 Specification: biblio.asn
C Structures and Functions: objbibli.h
ASN.1 Specification: pub.asn
C Structures and Functions: objpub.h


 Introduction

The published literature is an essential component of any scientific endeavor, not just in molecular biology.  The bibliographic component of the specification and the tools which go with it may find wide use then, permitting reuse of software and databases in many contexts.  In addition, the fact that bibliographic citations appear in data from many sources, makes this data extremely valuable in linking data items from different databases to each other (i.e. indirectly through a shared literature citation) to build integrated views of complex data.  For this reason, it is also important that database builders ensure that their literature component contain sufficient information to permit this mapping.  By conforming to the specification below one can be assured that this will be the case.

Much of the following bibliographic specification was derived from the components recommended in the American National Standard for Bibliographic References (ANSI Z39.29-1977), and in interviews with professional librarians at the National Library of Medicine.  The recommendations were then relaxed somewhat (by making certain fields OPTIONAL) to accommodate the less complete citation information available in current biomedical databases.  Thus, although a field may be OPTIONAL, a database builder should still attempt to fill it, if it can reasonably be done.

In this chapter we also present a specification for the base class Pub, publications of any sort and collections of publications. The MEDLINE specification has enough unique components that it is discussed separately in another chapter.

Citation Components: Affiliation

Affiliation is effectively the institutional affiliation of an author.  Since it has the same fields needed to cite a publisher (of a book) it is reused in that context as well, although in that case it is not precisely an "affiliation".  Affil is a CHOICE of two forms, a structured form which is preferred, or an unstructured string when that is all that is available.

The structured form has a number of fields taken from the ANSI guidelines.  "affil" is institutional affiliation, such as "Harvard University".  "div" is division within institution, such as "Department of Molecular Biology". "city" is obvious. "sub" is a subdivision of a country.  In the United States, this would be the state.  "country" is obvious. "street" has been added to the specification (it is not included in ANSI) so that it is possible to produce a valid mailing address.

Citation Components: Authors

Auth-list is the list of authors for the citation. It is a SEQUENCE, not a SET, since the order of author names matters.  The names can be unstructured strings (the least desirable), semi-structured strings following the MEDLINE rules (e.g. "Jones JM"), or fully structured Authors (most desirable). An Affil can be associated with the whole list (typical of a scientific article). A more detailed discussion on the use of different types of names can be found in the "Identifying People" section of the "General Use Objects" chapter.

If fully structured Authors are used, each Author can have an individual Affil.  The Author uses Person-id (defined in general.asn) which can be an unstructured string or MEDLINE string, as above, or a fielded name with the components broken out separately.  The Author form also allows specification of the role of individual authors in producing the citation.  A primary author does not mean the "first" author, but rather that this author had a role in the original writing or experimental work.  A secondary author is a reviewer or editor of the article.  It is rare in a scientific work that a secondary author is ever mentioned by name.  Authors may play different roles in the work: compiling, editing, translating.  Again, in a scientific work, the authors mentioned did none of these things, but were involved in the actual writing of the paper, although it would not be unusual anymore for one author to be the patent assignee.  For scientific work, then, the main advantages of using the Author form are the use of fielded names and of individual Affils.  For a book, being able to indicate the editors vs. the authors is useful also.

Citation Components: Imprint

Imprint provides information about the physical form in which the citation appeared, such as what volume and issue of a journal it was in. For the "date" a structured Date is preferred. While "volume", "issue", and "pages" are commonly integers, there are many cases where they are not pure integers (e.g. pages xvi-xvii or issue 10A).  Pages is given as a single string to simplify input from different sources.  The convention is first page (hyphen) last page, or just page if it is on a single page.  "section" may be relevant to a book or proceedings.  "pub" is an Affil used to give the publisher of a book.  The Affil.affil field is used to give the name of the publisher. "cprt" is the copyright date for a book. "part-sup" is for part or supplement and is not part of ANSI, but is used by MEDLINE. "language" is for the original language of the publication, which is also used by MEDLINE, but is not part of the ANSI standard. "prepub" is not part of the ANSI standard, but was added by NCBI to accommodate citations for as yet unpublished papers that can accompany data directly submitted by authors to the database.

Citation Components: Title

A published work may have a number of Titles, each playing a particular role in specifying the work.  There is the title of a paper, the title of a book it appears in, or the title of the journal, in which case it may come from a controlled list of serials.  There may also be an original title and a translated title.  For these reasons, Title is a defined entity rather than just a string, to allow the roles to be specified explicitly. Certain types of Title are legal for an Article, but not for a Journal or a Book.  Rather than make three overlapping definitions, one for Article Titles, one for Journal Titles, and one for Book Titles, we have made one Title type and just indicated in the comments of the specification whether a particular form of Title is legal for an Article, Journal, or Book. Title is a SET OF because a work may have more than one title (e.g. an original and a translated title, or an ISO journal title abbreviation and an ISSN).

Title can be of a number of types. "name" is the full title of an article, or the full name of a book or journal. "tsub" is a subordinate title (e.g. "Hemoglobin Binds Oxygen" might be a primary title, while "Heme Groups in Biology: Part II" might be a subordinate title). "trans" is the translated title.  So for an English language database like MEDLINE which contains an article originally published in French, the French title is "name" and the English version of it is "trans".

"jta" is a journal title abbreviation.  It is only valid for a journal name, obviously.  "jta" does not specify what kind of abbreviation it is, so it is the least useful of the journal designations available and should only be used as a last resort. "iso-jta" is an International Standards Organization (ISO) journal title abbreviation.  This is the preferred form.  A list of valid iso-jta's is available from NCBI or the National Library of Medicine. "ml-jta" is a MEDLINE journal title abbreviation.  MEDLINE pre-dates the ISO effort, so it does not use iso-jta's. "coden" is a six letter code for journals which is used by a number of groups, particularly in Europe.  "issn" is a code used by publishers to identify journals.  To facilitate the use of controlled vocabularies for journal titles, NCBI maintains a file of mappings between "name", "iso-jta", "ml-jta", "coden", and "issn" where it is possible, and this file is available upon request.

"abr" is strictly the abbreviated title of a book. "isbn" is similar to "issn" in that it is a publisher's code for a book.  "isbn" is very useful, but one must be careful since it is used by publishers to list books, and to a publisher a hard cover book is different from a paperback (and gets a different "isbn") even if they have the same title.

Citing an Article

An article always occurs within some other published medium.  It can be an article in a journal or a chapter or section in a book or proceedings.  Thus there are two components to an article citation; a citation for the work it was published in and a citation for the article within that work. Cit-art.title is the Title of the article and Cit-art.authors are the authors of the article. The "from" field is used to indicate the medium the article was published in, and reuses the standard definitions for citing a journal, book, or proceedings.

In the C structure, CitArt.from gives the type of medium published in, and CitArt.fromptr must be cast appropriately to CitJourPtr or CitBookPtr (proceedings uses the same structure as book).

Citing a Journal

Cit-jour is used to cite an issue of a journal, not an article within a journal (see Cit-art, above). Cit-jour.title is the title of the journal, and Cit-jour.imp gives the date, volume, issue of the journal.  Cit-jour.imp also gives the pages of an article within the issue when used as part of a Cit-art.  This is not the purest possible split between article and journal, book, or proceedings, but does have the practical advantage of putting all such physical medium information together in a single common data structure.  A controlled list of journal titles is maintained by NCBI, and database builders are encouraged to use this list to facilitate exchange and linking of data between databases.

Citing a Book

Cit-book is used to cite a whole book, not an article within a book (see Cit-art, above). Cit-book.title is the title of this particular book. Cit-book.coll is used if the book is part of a collection or multi-volume set (e.g. "The Complete Works of Charles Darwin"). Cit-book.authors is for the authors or editors of the book itself (not necessarily of any particular chapter). Cit-book.imp contains the publication information about the book.  As with a Cit-art, if the Cit-book is being used to cite a chapter in a book, the pages are given in Cit-book.imp.

In the C structure, CitBook is used for Cit-book, Cit-proc, and Cit-let, since they have most fields in common. If CitBook.othertype is 0, it is just a Cit-book.

Citing a Proceedings

A Proceedings is a book published as a result or byproduct of a meeting.  As such it contains all the same fields as a Cit-book and an additional block of information describing the meeting.  These extra fields are the meeting number (as a string to accommodate things like "10A"), the date the meeting occurred, and an OPTIONAL Affil to record the place of the meeting.  The name of the organization or meeting is normally the book title.  Don't be confused by things like the Proceedings of the National Academy of Sciences, USA, which is really a journal.

In the C structure, a CitBook is used, with CitBook.othertype set to 1.  CitBook.otherdata contains a ValNodePtr.  The proceedings can have up to 3 ValNodes where the ValNode.choice indicates the component of the Meeting information, and ValNode.data.ptrvalue contains a pointer to the appropriate data as below:

*     choice       ASN.1 field      Pointer type

*       1          number           CharPtr

*       2          date             DatePtr

*       3          place            AffilPtr

There are separate CitProcAsnRead() and CitProcAsnWrite() functions.  A proceedings reuses the parent class CitBookNew() and CitBookFree() functions.

Citing a Letter, Manuscript, or Thesis

A letter, manuscript, or a thesis share most components and so are grouped together under Cit-let. They all require most of the attributes of a book, and thus Cit-let incorporates the Cit-book structure.  Unlike a normal book, they will not have a copyright date.  A letter or manuscript will not have a publisher, although a thesis may.  In addition, a manuscript may have a manuscript identifier (e.g. "Technical Report X1134").

The CitBook C structure is reused for Cit-let.  The CitBook.othertype is 2.  CitBook.let_type is used to indicate if it is a letter, manuscript, or thesis. If it is a manuscript, then CitBook.otherdata is a CharPtr which may be NULL, or point to a string with the manuscript-id.

Citing Directly Submitted Data

This form is used to cite the submission of data directly to a database, independent of any publication(s) which may be associated with the data as well. Authors (of the submission) and Date (in an Imprint) are required.  The Affiliation of the Authors should be filled in the Author-list. Optionally one may also record the medium in which the submission was made.

Citing a Patent

A full patent citation, Cit-pat, conveys not only enough information to identify a patent (see below) but to characterize it somewhat as well.  A patent has a title and authors, the country in which the patent was issued, a document type and number, and the date the patent was issued.  Patents are grouped into classes based on the patent subject, and this may be useful to know. In addition, when a patent is first filed it is issued an application number (different from the document number assigned to the issued patent).  For tracking purposes, or issues of precedence, it is also helpful to know the application number and filing date.

The C structure, CitPat, is a straightforward mapping of the Cit-pat fields.

Identifying a Patent

When citing a patent, it may be sufficient to merely unambiguously identify it, on the assumption that more extensive information will be available from some other source, given the identifier.  Id-pat thus contains fields only for the country in which the patent was applied for, or issued in, then a CHOICE of the patent document number (if issued) or the application number (if pending).

The C structure, IdPat, is a straightforward mapping of the Id-pat fields.

Citing an Article or Book which is In Press

A number of the fields in Cit-art and Cit-book are OPTIONAL, not only to allow incorporation of older, incomplete databases, but also to allow partial information for works submitted, or in press.  One simply fills in as many of the fields in Cit-art or Cit-book as possible.  One must also set the "prepub" flag in Imprint to the appropriate status.  That's it.  Once the work is published, the remaining information is filled in and the "prepub" flag is removed.  NOTE: this does NOT apply to work which is "unpublished" or "personal communication", or even "in preparation" because one knows nothing about where or when (or if) it will ever be published.  One must use a Cit-gen for this (below).

Special Cases: Unpublished, Unparsed, or Unusual

A generic citation, Cit-gen, is used to hold anything not fitting into the more usual bibliographic entities described above.  Cit-gen.cit is a string which can hold an unparsable citation (if you can parse it into a structured type, you should). Sometimes it is possible to parse some things but not everything.  In this case, a number of fields, such as authors, journal, etc., which are similar to those in the structured types, can be populated as much as possible, and the remainder of the unparsed string can go in "cit".

Less standard citation types, such as a MEDLINE unique identifier, or the serial numbers used in the GenBank flatfile can be accommodated by Cit-gen. An unpublished citation normally has authors and date filled into the structured fields.  Often a title is available as well (e.g. for a talk or for a manuscript in preparation). The string "unpublished" can then appear in the "cit" field.

Software developed to display or print a Cit-gen must be opportunistic about using whatever information is available.  Obviously it is not possible to assume that all Cit-gens can be displayed in a uniform manner, but in practice at NCBI we have found they can generally be made fairly regular.

Accommodating Any Publication Type

A Pub is the bibliographic object base class. It can accommodate a citation of any kind defined in the bibliographic specification, the MEDLINE specification, and more. It is very useful when one wishes to be able to associate a bibliographic reference in a very general way with a software tool or data item, yet still preserve the attributes specific for each class of citation.  Pub is widely used for this purpose in the NCBI specifications.

The C structures implement a Pub as a ValNode, where the choice gives the publication type and, in most cases, data.ptrvalue is a pointer to the appropriate data structure (and must be cast to the appropriate type for further use). The exception is for MEDLINE uid, which uses the data.intvalue field. The values are listed in objpub.h.

Grouping Different Forms of Citation for a Single Work

In some cases a database builder may wish to present more than one form of citation for the same bibliographic work. For example, in a sequence entry from the NCBI Backbone database, it is useful to provide the MEDLINE uid (for use as a link by other software tools), the Cit-art (for display to the user), and a Cit-gen containing the internal NCBI Backbone identifier for this publication as the string "pub_id = 188824" (for use in checking the database by in-house staff) for the same article. The Pub-equiv provides this capacity. It is a SET OF Pub. Each element in the SET is an equivalent citation for the same bibliographic work.  Software can examine the SET and select the form most appropriate to the job at hand.

A Pub-equiv is implemented as a linked list of ValNodes, where each ValNode is a Pub as described above. NOTE: a Pub of type Pub-equiv is a ValNode whose choice indicates pub-equiv and whose data.ptrvalue is the head of the linked list of ValNodes.

Sets of Citations

One often needs to collect a set of citations together.  Unlike the Pub-equiv (above), a Pub-set is a set of citations for DIFFERENT bibliographic works. It is a CHOICE of types for a mixture of publication classes, or for a collection of the same publication class.

A Pub-set is implemented as a ValNode, where the choice gives the type of the Pub-set and data.ptrvalue points to a linked list of ValNodes. The ValNodes are necessary to create the linked list. For convenience then, the choice of each ValNode is set appropriately for the type of bibliographic object it holds. This is only technically necessary for Pub-set of type "pub", but since it costs nothing all classes of Pub-set are done the same way.

Comparing Citations

A common question is whether two citations refer to the same publication. Note that this does not necessarily mean they are identical. For example a Medline-entry may refer to the same article as a Cit-art or a simple MEDLINE uid type of Pub. A series of xxxMatch() functions makes this determination. Like strcmp() they return 0 if the two arguments refer to the same publication, 1 if the second argument comes after the first, or -1 if the first argument comes after the second. When possible, the ordering is based on some rational attribute of that Pub type, such as MEDLINE uid order. However, particularly when comparing different types of Pubs, the ordering is arbitrary, but unique. Thus the xxxMatch() functions can be used to sort various kinds of Pubs in the same list, or to locate Pubs in such an ordered list by binary search.

The most general function is PubMatch(a,b), which compares two Pubs of any type. PubEquivMatch(a,b) compares two PubEquivs only, CitArtMatch(a,b) compares two CitArts only, and so on.

ASN.1 Specification: biblio.asn

--$Revision: 1.2 $

--****************************************************************

--

--  NCBI Bibliographic data elements

--  by James Ostell, 1990

--

--  Taken from the American National Standard for

--      Bibliographic References

--      ANSI Z39.29-1977

--

--****************************************************************

 

NCBI-Biblio DEFINITIONS ::=

BEGIN

 

EXPORTS Cit-art, Cit-jour, Cit-book, Cit-pat, Cit-let, Id-pat, Cit-gen,

       Cit-proc, Cit-sub;

 

IMPORTS Person-id, Date FROM NCBI-General;

 

    -- Citation Types

 

-- Cit-art cites an article inside a larger work: title and authors describe
-- the article itself, and "from" carries the citation of the containing medium.
Cit-art ::= SEQUENCE {                  -- article in journal or book

    title Title OPTIONAL ,              -- title of paper (ANSI requires)

    authors Auth-list OPTIONAL ,        -- authors (ANSI requires)

    from CHOICE {                       -- medium published in: journal, book, or proceedings

        journal Cit-jour ,

        book Cit-book ,

        proc Cit-proc } }

 

-- Cit-jour cites an issue of a journal, not an article within it (see Cit-art).
-- When nested inside a Cit-art, imp also carries the pages of the article.
Cit-jour ::= SEQUENCE {             -- Journal citation

    title Title ,                   -- title of journal

    imp Imprint }                   -- date, volume, issue of the journal

 

-- Cit-book cites a whole book, not an article or chapter within it (see Cit-art).
Cit-book ::= SEQUENCE {              -- Book citation

    title Title ,                    -- Title of book

    coll Title OPTIONAL ,            -- title of the collection or multi-volume set, if any

    authors Auth-list,               -- authors or editors of the book itself

    imp Imprint }                    -- publication information; pages when citing a chapter

 

-- A proceedings is a book published as a result of a meeting: all the fields
-- of a Cit-book plus a block describing the meeting (number, date, place).
Cit-proc ::= SEQUENCE {             -- Meeting proceedings

    book Cit-book ,                 -- citation to meeting

    meet Meeting }                  -- time and location of meeting

   

-- Full patent citation: identifies the issued patent and also characterizes it.
-- The application number and filing date are kept for tracking and precedence.
Cit-pat ::= SEQUENCE {                  -- patent citation

    title VisibleString ,

    authors Auth-list,                  -- authors

    country VisibleString ,             -- Patent Document Country

    doc-type VisibleString ,            -- Patent Document Type

    number VisibleString ,              -- Patent Document Number

    date-issue Date ,                   -- Patent-Issue Date

    class VisibleString OPTIONAL ,      -- Patent Doc Class Code

    app-number VisibleString OPTIONAL , -- Patent Doc Appl Number

    app-date Date OPTIONAL }            -- Patent Appl File Date

 

-- Minimal patent identifier, for use when fuller information is available
-- elsewhere: the country plus either the document or the application number.
Id-pat ::= SEQUENCE {                   -- just to identify a patent

    country VisibleString ,             -- Patent Document Country

    id CHOICE {

        number VisibleString ,          -- Patent Document Number (if issued)

        app-number VisibleString } }    -- Patent Doc Appl Number (if pending)

 

-- Letters, manuscripts, and theses share most components of a book, so the
-- Cit-book structure is embedded and a type tag tells the three apart.
Cit-let ::= SEQUENCE {                  -- letter, thesis, or manuscript

    cit Cit-book ,                      -- same fields as a book

    man-id VisibleString OPTIONAL ,     -- Manuscript identifier, e.g. a technical report number

    type ENUMERATED {                   -- which kind of document this is

        manuscript (1) ,

        letter (2) ,

        thesis (3) } OPTIONAL }

                                -- NOTE: this is just to cite a

                                -- direct data submission, see NCBI-Submit

                                -- for the form of a sequence submission

-- Cites the act of submitting data directly to a database, independent of any
-- publication associated with the data.  Authors and the Imprint are required.
Cit-sub ::= SEQUENCE {               -- citation for a direct submission

    authors Auth-list ,              -- not necessarily authors of the paper

    imp Imprint ,                    -- carries the date of the submission

    medium ENUMERATED {              -- medium of submission

        paper   (1) ,

        tape    (2) ,

        floppy  (3) ,

        email   (4) ,

        other   (255) } OPTIONAL }

   

-- Generic citation for anything that does not fit the structured types.
-- Fill the structured fields as far as the source can be parsed and put the
-- unparsed remainder in cit.
Cit-gen ::= SEQUENCE {      -- NOT from ANSI, this is a catchall

    cit VisibleString OPTIONAL ,     -- anything, not parsable

    authors Auth-list OPTIONAL ,

    muid INTEGER OPTIONAL ,      -- medline uid

    journal Title OPTIONAL ,

    volume VisibleString OPTIONAL ,

    issue VisibleString OPTIONAL ,

    pages VisibleString OPTIONAL ,

    date Date OPTIONAL ,

    serial-number INTEGER OPTIONAL ,   -- for GenBank style references

   title VisibleString OPTIONAL }     -- eg. cit="unpublished",title="title"

   

   

    -- Authorship Group

-- List of authors for a citation.  The name lists are SEQUENCE OF, not SET OF,
-- because the order of author names matters.
Auth-list ::= SEQUENCE {

        names CHOICE {

            std SEQUENCE OF Author ,        -- full citations (most desirable)

            ml SEQUENCE OF VisibleString ,  -- MEDLINE, semi-structured, e.g. "Jones JM"

            str SEQUENCE OF VisibleString } , -- free for all (least desirable)

        affil Affil OPTIONAL }        -- author affiliation, applied to the whole list

 

-- One fully structured author: a Person-id plus optional level, role, and an
-- individual affiliation.
Author ::= SEQUENCE {

    name Person-id ,                        -- Author, Primary or Secondary

    level ENUMERATED {                  -- primary does not mean first author

        primary (1),                    -- had a role in the original writing or experimental work

        secondary (2) } OPTIONAL ,      -- reviewer or editor of the article

    role ENUMERATED {                   -- Author Role Indicator

        compiler (1),

        editor (2),

        patent-assignee (3),

        translator (4) } OPTIONAL ,

    affil Affil OPTIONAL ,              -- affiliation of this individual author

   is-corr BOOLEAN OPTIONAL }          -- TRUE if corresponding author

 

Affil ::= CHOICE {

    str VisibleString ,                 -- unparsed string

    std SEQUENCE {                      -- std representation

    affil VisibleString OPTIONAL ,      -- Author Affiliation, Name

    div VisibleString OPTIONAL ,        -- Author Affiliation, Division

    city VisibleString OPTIONAL ,       -- Author Affiliation, City

    sub VisibleString OPTIONAL ,        -- Author Affiliation, County Sub

    country VisibleString OPTIONAL ,    -- Author Affiliation, Country

   street VisibleString OPTIONAL }}    -- street address, not ANSI

 

    -- Title Group

    -- Valid for = A = Analytic (Cit-art)

    --             J = Journals (Cit-jour)

    --             B = Book (Cit-book)

                                                 -- Valid for:

Title ::= SET OF CHOICE {

    name VisibleString ,    -- Title, Anal,Coll,Mono    AJB

    tsub VisibleString ,    -- Title, Subordinate       A B

    trans VisibleString ,   -- Title, Translated        AJB

    jta VisibleString ,     -- Title, Abbreviated        J

    iso-jta VisibleString , -- specifically ISO jta      J

    ml-jta VisibleString ,  -- specifically MEDLINE jta  J

    coden VisibleString ,   -- a coden                   J

    issn VisibleString ,    -- ISSN                      J

    abr VisibleString ,     -- Title, Abbreviated         B

    isbn VisibleString }    -- ISBN                       B

 

Imprint ::= SEQUENCE {                  -- Imprint group

    date Date ,                         -- date of publication

    volume VisibleString OPTIONAL ,

    issue VisibleString OPTIONAL ,

    pages VisibleString OPTIONAL ,

    section VisibleString OPTIONAL ,

    pub Affil OPTIONAL,                     -- publisher, required for book

    cprt Date OPTIONAL,                     -- copyright date, required for book

    part-sup VisibleString OPTIONAL ,       -- used in MEDLINE

    language VisibleString DEFAULT "ENG" ,  -- put here for simplicity

   prepub ENUMERATED {                     -- for prepublication citations

       submitted (1) ,                     -- submitted, not accepted

       in-press (2) ,                    -- accepted, not published

       other (255)  } OPTIONAL }

 

Meeting ::= SEQUENCE {

    number VisibleString ,

    date Date ,

    place Affil OPTIONAL }

 

           

END

C Structures and Functions: objbibli.h

/*  objbibli.h

* ===========================================================================

*

*                            PUBLIC DOMAIN NOTICE                         

*               National Center for Biotechnology Information

*                                                                          

*  This software/database is a "United States Government Work" under the  

*  terms of the United States Copyright Act.  It was written as part of   

*  the author's official duties as a United States Government employee and

*  thus cannot be copyrighted.  This software/database is freely available

*  to the public for use. The National Library of Medicine and the U.S.   

*  Government have not placed any restriction on its use or reproduction. 

*                                                                          

*  Although all reasonable efforts have been taken to ensure the accuracy 

*  and reliability of the software and data, the NLM and the U.S.         

*  Government do not and cannot warrant the performance or results that   

*  may be obtained by using this software or data. The NLM and the U.S.   

*  Government disclaim all warranties, express or implied, including      

*  warranties of performance, merchantability or fitness for any particular

*  purpose.                                                                

*                                                                         

*  Please cite the author in any work or product based on this material.  

*

* ===========================================================================

*

* File Name:  objbibli.h

*

* Author:  James Ostell

*  

* Version Creation Date: 1/1/91

*

* $Revision: 1.2 $

*

* File Description:  Object manager interface for module NCBI-Biblio

*

* Modifications: 

* --------------------------------------------------------------------------

* Date    Name        Description of modification

* -------  ----------  -----------------------------------------------------

*

*

* ==========================================================================

*/

 

#ifndef _NCBI_Biblio_

#define _NCBI_Biblio_

 

#ifndef _ASNTOOL_

#include <asn.h>

#endif

#ifndef _NCBI_General_

#include <objgen.h>

#endif

 

#ifdef __cplusplus

extern "C" {

#endif

 

/*****************************************************************************

*

*   loader

*

*****************************************************************************/

extern Boolean BiblioAsnLoad PROTO((void));

 

/*****************************************************************************

*

*   Affil

*

*****************************************************************************/

typedef struct affil {

   Uint1 choice;           /* [1]=str,[2]=std */

   CharPtr affil,          /* std: affiliation name; also holds the unparsed string when choice is str */

       div,                /* division */

       city,               /* city */

       sub,                /* "County Sub" in the ASN.1 spec */

       country,            /* country */

       street;             /* street address (not ANSI, per the ASN.1 spec) */

} Affil, PNTR AffilPtr;

 

extern AffilPtr AffilNew PROTO((void));

extern AffilPtr AffilFree PROTO((AffilPtr afp));

extern AffilPtr AffilAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean AffilAsnWrite PROTO((AffilPtr afp, AsnIoPtr aip, AsnTypePtr atp));

 

/*****************************************************************************

*

*   AuthList

*

*****************************************************************************/

typedef struct authors {

   Uint1 choice;        /* [1]=std, [2]=ml, [3]=str (only on Cit-art,gen) */

   ValNodePtr names;    /* the SEQUENCE OF */

   AffilPtr affil;

} AuthList, PNTR AuthListPtr;

 

extern AuthListPtr AuthListNew PROTO((void));

extern AuthListPtr AuthListFree PROTO((AuthListPtr asp));

extern AuthListPtr AuthListAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean AuthListAsnWrite PROTO((AuthListPtr afp, AsnIoPtr aip, AsnTypePtr atp));

Int2 AuthListMatch PROTO((AuthListPtr a, AuthListPtr b, Boolean all));

 

/*****************************************************************************

*

*   Author

*

*****************************************************************************/

typedef struct author {

   PersonIdPtr name;

   Uint1 lr[2];       /* lr[0]=level (spec: 1=primary, 2=secondary),
                         lr[1]=role (spec: 1=compiler, 2=editor,
                         3=patent-assignee, 4=translator); 0=not used */

   Uint1 is_corr;     /* corresponding author? 255=not set, 0=false, 1=true */

   AffilPtr affil;    /* optional affiliation for this author */

} Author, PNTR AuthorPtr;

 

extern AuthorPtr AuthorNew PROTO((void));

extern AuthorPtr AuthorFree PROTO((AuthorPtr ap));

extern AuthorPtr AuthorAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean AuthorAsnWrite PROTO((AuthorPtr ap, AsnIoPtr aip, AsnTypePtr atp));

 

/*****************************************************************************

*

*   Cit-art

*

*****************************************************************************/

typedef struct citart {

   ValNodePtr title;       /* choice[1]=name,[2]=tsub,[3]=trans */

   AuthListPtr authors;

   Uint1 from;             /* [1]=journal,[2]=book,[3]=proc */

   Pointer fromptr;

} CitArt, PNTR CitArtPtr;

 

extern CitArtPtr CitArtNew PROTO((void));

extern CitArtPtr CitArtFree PROTO((CitArtPtr cap));

extern CitArtPtr CitArtAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean CitArtAsnWrite PROTO((CitArtPtr cap, AsnIoPtr aip, AsnTypePtr atp));

Int2 CitArtMatch PROTO((CitArtPtr a, CitArtPtr b));

 

/*****************************************************************************

*

*   Imprint

*

*****************************************************************************/

typedef struct imprint {

   DatePtr date;      /* date of publication */

    CharPtr volume,

        issue,

        pages,

        section,

        part_sup,     /* part-sup in the ASN.1 spec (used in MEDLINE) */

        language;     /* spec default is "ENG"; NOTE(review): confirm how an
                         absent value is represented here */

    DatePtr cprt;     /* copyright date (for books) */

    AffilPtr pub;   /* publisher (for books)  */

   Uint1 prepub;   /* 0=not set 1=submitted 2=in-press 255=other */

} Imprint, PNTR ImprintPtr;

 

extern ImprintPtr ImprintNew PROTO((void));

extern ImprintPtr ImprintFree PROTO((ImprintPtr cap));

extern ImprintPtr ImprintAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean ImprintAsnWrite PROTO((ImprintPtr cap, AsnIoPtr aip, AsnTypePtr atp));

Int2 ImprintMatch PROTO((ImprintPtr a, ImprintPtr b, Boolean all));

 

/*****************************************************************************

*

*   Cit-jour

*

*****************************************************************************/

typedef struct citjour {

   ValNodePtr title;     /* choice in order of spec, 1=name,2=tsub,3=trans,etc
                            (see the Cit_title_* defines) */

   ImprintPtr imp;

} CitJour, PNTR CitJourPtr;

 

extern CitJourPtr CitJourNew PROTO((void));

extern CitJourPtr CitJourFree PROTO((CitJourPtr cjp));

extern CitJourPtr CitJourAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean CitJourAsnWrite PROTO((CitJourPtr cjp, AsnIoPtr aip, AsnTypePtr atp));

Int2 CitJourMatch PROTO((CitJourPtr a, CitJourPtr b));

 

/*****************************************************************************

*

*   Cit-book

*

*****************************************************************************/

typedef struct citbook {

   ValNodePtr title,      /* choice in order of spec, 1=name, 2=tsub, etc */

             coll;       /* same choice numbering as title */

   AuthListPtr authors;

   ImprintPtr imp;

   Uint1 othertype,      /* 0=Cit-book, 1=Cit-proc, 2=Cit-let */

       let_type;         /* if Cit-let, 1=manuscript,2=letter,3=thesis */

   Pointer otherdata;    /* depends on othertype: NULL for Cit-book,
                            chain of ValNodes for Cit-proc,
                            CharPtr man-id for Cit-let */

} CitBook, PNTR CitBookPtr;

 

extern CitBookPtr CitBookNew PROTO((void));

extern CitBookPtr CitBookFree PROTO((CitBookPtr cbp));

extern CitBookPtr CitBookAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean CitBookAsnWrite PROTO((CitBookPtr cbp, AsnIoPtr aip, AsnTypePtr atp));

Int2 CitBookMatch PROTO((CitBookPtr a, CitBookPtr b));

 

/*****************************************************************************

*

*   Cit-sub

*      Direct submission of data

*

*****************************************************************************/

typedef struct citsub {

   AuthListPtr authors;   /* not necessarily authors of the paper (per spec) */

   ImprintPtr imp;

   Uint1 medium;          /* medium of submission; spec values: 1=paper,
                             2=tape, 3=floppy, 4=email, 255=other.
                             NOTE(review): presumably 0=not set, as for the
                             other optional enumerations -- confirm */

} CitSub, PNTR CitSubPtr;

 

extern CitSubPtr CitSubNew PROTO((void));

extern CitSubPtr CitSubFree PROTO((CitSubPtr cbp));

extern CitSubPtr CitSubAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean CitSubAsnWrite PROTO((CitSubPtr cbp, AsnIoPtr aip, AsnTypePtr atp));

Int2 CitSubMatch PROTO((CitSubPtr a, CitSubPtr b));

 

 

/*****************************************************************************

*

*   Cit-proc

*     uses otherdata in Cit-book

*     chain of ValNodes

*     choice       ident      Pointer type

*       1          number      CharPtr

*       2          date        DatePtr

*       3          place       AffilPtr

*

*****************************************************************************/

extern CitBookPtr CitProcAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean CitProcAsnWrite PROTO((CitBookPtr cpp, AsnIoPtr aip, AsnTypePtr atp));

 

/*****************************************************************************

*

*   Cit-let

*     uses otherdata in Cit-book as CharPtr for man-id

*

*****************************************************************************/

extern CitBookPtr CitLetAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean CitLetAsnWrite PROTO((CitBookPtr cpp, AsnIoPtr aip, AsnTypePtr atp));

 

 

 

/*****************************************************************************

*

*   Cit-pat

*

*****************************************************************************/

typedef struct citpat {

   CharPtr title;

   AuthListPtr authors;

   CharPtr country,

       doc_type,

       number;

   DatePtr date_issue;

   CharPtr _class,

       app_number;

   DatePtr app_date;

} CitPat, PNTR CitPatPtr;

 

extern CitPatPtr CitPatNew PROTO((void));

extern CitPatPtr CitPatFree PROTO((CitPatPtr cpp));

extern CitPatPtr CitPatAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean CitPatAsnWrite PROTO((CitPatPtr cpp, AsnIoPtr aip, AsnTypePtr atp));

 

/*****************************************************************************

*

*   Id-pat

*

*****************************************************************************/

typedef struct idpat {

   CharPtr country,

       number,                           /** actually CHOICE of number or app_number */

       app_number;

} IdPat, PNTR IdPatPtr;

 

extern IdPatPtr IdPatNew PROTO((void));

extern IdPatPtr IdPatFree PROTO((IdPatPtr ipp));

extern IdPatPtr IdPatAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean IdPatAsnWrite PROTO((IdPatPtr ipp, AsnIoPtr aip, AsnTypePtr atp));

extern Boolean IdPatMatch PROTO((IdPatPtr a, IdPatPtr b));

 

/*****************************************************************************

*

*   Cit-gen

*

*****************************************************************************/

typedef struct cit_gen {

   CharPtr cit;

   AuthListPtr authors;

    Int4 muid;                  /* medline uid, -1 if not set */

    ValNodePtr journal;         /* journal/book Title */

    CharPtr volume,

        issue,

        pages;

   DatePtr date;

    Int2 serial_number;      /* for GenBank style references (-1 = not used)*/

   CharPtr title;           /* a specific title (in addition to cit or journal) */

} CitGen, PNTR CitGenPtr;

 

extern CitGenPtr CitGenNew PROTO((void));

extern CitGenPtr CitGenFree PROTO((CitGenPtr cgp));

extern CitGenPtr CitGenAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean CitGenAsnWrite PROTO((CitGenPtr cgp, AsnIoPtr aip, AsnTypePtr atp));

Int2 CitGenMatch PROTO((CitGenPtr a, CitGenPtr b, Boolean all));

 

/*****************************************************************************

*

*   Title

*

*****************************************************************************/

 

extern ValNodePtr TitleFree PROTO((ValNodePtr anp));

extern ValNodePtr TitleAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean TitleAsnWrite PROTO((ValNodePtr anp, AsnIoPtr aip, AsnTypePtr atp));

Int2 TitleMatch PROTO((ValNodePtr a, ValNodePtr b, Uint1 type));

 

#define Cit_title_name ( (Uint1) 1)

#define Cit_title_tsub ( (Uint1) 2)

#define Cit_title_trans ( (Uint1) 3)

#define Cit_title_jta ( (Uint1) 4)

#define Cit_title_iso_jta ( (Uint1) 5)

#define Cit_title_ml_jta ( (Uint1) 6)

#define Cit_title_coden ( (Uint1) 7)

#define Cit_title_issn ( (Uint1) 8)

#define Cit_title_abr ( (Uint1) 9)

#define Cit_title_isbn ( (Uint1) 10)

 

#ifdef __cplusplus

}

#endif

 

#endif

ASN.1 Specification: pub.asn

--$Revision: 1.2 $

--********************************************************************

--

--  Publication common set

--  James Ostell, 1990

--

--  This is the base class definitions for Publications of all sorts

--

--********************************************************************

 

NCBI-Pub DEFINITIONS ::=

BEGIN

 

EXPORTS Pub, Pub-set, Pub-equiv;

 

IMPORTS Medline-entry FROM NCBI-Medline

        Cit-art, Cit-jour, Cit-book, Cit-proc, Cit-pat, Id-pat, Cit-gen,

        Cit-let, Cit-sub FROM NCBI-Biblio;

 

Pub ::= CHOICE {

    gen Cit-gen ,        -- general or generic unparsed

    sub Cit-sub ,        -- submission

    medline Medline-entry ,

    muid INTEGER ,       -- medline uid

    article Cit-art ,

    journal Cit-jour ,

    book Cit-book ,

    proc Cit-proc ,      -- proceedings of a meeting

    patent Cit-pat ,

    pat-id Id-pat ,      -- identify a patent

    man Cit-let ,        -- manuscript, thesis, or letter

    equiv Pub-equiv }    -- to cite a variety of ways

 

Pub-equiv ::= SET OF Pub   -- equivalent identifiers for same citation

 

Pub-set ::= CHOICE {

    pub SET OF Pub ,

    medline SET OF Medline-entry ,

    article SET OF Cit-art ,

    journal SET OF Cit-jour ,

    book SET OF Cit-book ,

    proc SET OF Cit-proc ,      -- proceedings of a meeting

    patent SET OF Cit-pat }

 

END

C Structures and Functions: objpub.h

/*  objpub.h

* ===========================================================================

*

*                            PUBLIC DOMAIN NOTICE                          

*               National Center for Biotechnology Information

*                                                                         

*  This software/database is a "United States Government Work" under the  

*  terms of the United States Copyright Act.  It was written as part of   

*  the author's official duties as a United States Government employee and

*  thus cannot be copyrighted.  This software/database is freely available

*  to the public for use. The National Library of Medicine and the U.S.   

*  Government have not placed any restriction on its use or reproduction. 

*                                                                         

*  Although all reasonable efforts have been taken to ensure the accuracy 

*  and reliability of the software and data, the NLM and the U.S.         

*  Government do not and cannot warrant the performance or results that   

*  may be obtained by using this software or data. The NLM and the U.S.   

*  Government disclaim all warranties, express or implied, including      

*  warranties of performance, merchantability or fitness for any particular

*  purpose.                                                               

*                                                                          

*  Please cite the author in any work or product based on this material.  

*

* ===========================================================================

*

* File Name:  objpub.h

*

* Author:  James Ostell

*  

* Version Creation Date: 4/1/91

*

* $Revision: 1.2 $

*

* File Description:  Object manager interface for module NCBI-Pub

*

* Modifications: 

* --------------------------------------------------------------------------

* Date    Name        Description of modification

* -------  ----------  -----------------------------------------------------

*

*

* ==========================================================================

*/

 

#ifndef _NCBI_Pub_

#define _NCBI_Pub_

 

#ifndef _ASNTOOL_

#include <asn.h>

#endif

#ifndef _NCBI_Biblio_

#include <objbibli.h>

#endif

#ifndef _NCBI_Medline_

#include <objmedli.h>

#endif

 

#ifdef __cplusplus

extern "C" {

#endif

 

/*****************************************************************************

*

*   loader

*

*****************************************************************************/

extern Boolean PubAsnLoad PROTO((void));

 

/*****************************************************************************

*

*   internal structures for NCBI-Pub objects

*

*****************************************************************************/

 

/*****************************************************************************

*

*   Pub is a choice using a ValNode, most types in data.ptrvalue

*   choice:

*   0 = not set

    1 = gen Cit-gen ,        -- general or generic unparsed

    2 = sub Cit-sub ,        -- submission

    3 = medline Medline-entry ,

    4 = muid INTEGER ,       -- medline uid (stored in data.intvalue)

    5 = article Cit-art ,

    6 = journal Cit-jour ,

    7 = book Cit-book ,

    8 = proc Cit-proc ,      -- proceedings of a meeting

    9 = patent Cit-pat ,

    10 = pat-id Id-pat ,      -- identify a patent

    11 = man Cit-let         -- manuscript, thesis, or letter

    12 = equiv Pub-equiv      -- set of equivalent citation forms for 1 pub

*

*****************************************************************************/

Boolean PubAsnWrite PROTO((ValNodePtr anp, AsnIoPtr aip, AsnTypePtr atp));

ValNodePtr PubAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

ValNodePtr PubFree PROTO((ValNodePtr anp));

 

#define PUB_Gen 1

#define PUB_Sub 2

#define PUB_Medline 3

#define PUB_Muid 4

#define PUB_Article 5

#define PUB_Journal 6

#define PUB_Book 7

#define PUB_Proc 8

#define PUB_Patent 9

#define PUB_Pat_id 10

#define PUB_Man 11

#define PUB_Equiv 12

/****

*  Pub and PubEquiv Matching functions (same citation, not same form)

*   PubMatch() returns

*      0 = point to same citation

*       1,-1 = same pub type, but different

*       2,-2 = different pub types, don't match

*   PubEquivMatch() returns

*      0 = point to same citation

*      1,-1 = point to different citations

*****/

Int2 PubMatch PROTO((ValNodePtr a, ValNodePtr b));

Int2 PubEquivMatch PROTO((ValNodePtr a, ValNodePtr b));

 

/*****************************************************************************

*

*   PubSet is a choice using a ValNode, PubSet->data.ptrvalue is chain of

*       Pubs (ValNodes) holding data for set for all types.

*   PubSet->choice:

*   0 = not set

    1 = pub Pub    -- set of real Pubs

                   -- the rest are implemented as Pubs anyway

    3 = medline Medline-entry ,

    5 = article Cit-art ,

    6 = journal Cit-jour ,

    7 = book Cit-book ,

    8 = proc Cit-proc ,      -- proceedings of a meeting

    9 = patent Cit-pat ,

*

*****************************************************************************/

Boolean PubSetAsnWrite PROTO((ValNodePtr anp, AsnIoPtr aip, AsnTypePtr atp));

ValNodePtr PubSetAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

ValNodePtr PubSetFree PROTO((ValNodePtr anp));

 

/*****************************************************************************

*

*   PubEquiv is just a chain of Pubs (ValNodes)

*

*****************************************************************************/

Boolean PubEquivAsnWrite PROTO((ValNodePtr anp, AsnIoPtr aip, AsnTypePtr atp));

ValNodePtr PubEquivAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

ValNodePtr PubEquivFree PROTO((ValNodePtr anp));

 

#ifdef __cplusplus

}

#endif

 

#endif

 



MEDLINE Data


Introduction
Structure of a MEDLINE Entry
MeSH Index Terms
Substance Records
Database Cross Reference Records
Funding Identifiers
Gene Symbols
ASN.1 Specification: medline.asn
C Structures and Functions: objmedli.h


 Introduction

MEDLINE is the largest and oldest biomedical database in the world. It is built at the National Library of Medicine (NLM), a part of NIH. At this writing it contains over seven million citations from the scientific literature from over 3500 different journals. MEDLINE is a bibliographic database. It contains citation information (e.g. title, authors, journal, etc.). Many entries contain the abstract from the article. All articles are carefully indexed by professionals according to formal guidelines in a variety of ways. All entries can be uniquely identified by an integer key, the MEDLINE unique identifier (MEDLINE uid).

MEDLINE is a valuable resource in its own right. In addition, the MEDLINE uid can serve as a valuable link between entries in factual databases. When NCBI processes a new molecular biology factual database into the standardized format, we also normalize the bibliographic citations and attempt to map them to MEDLINE. For the biomedical databases we have tried thus far, we have succeeded in mapping most or all of the citations this way. From then on, linkage to other data objects can be made simply and easily through the shared MEDLINE uid. The MEDLINE uid also allows movement from the data item to the world of scientific literature in general and back.

Structure of a MEDLINE Entry

Each Medline-entry represents a single article from the scientific literature. The MEDLINE uid is an INTEGER which uniquely identifies the entry. If corrections are made to the contents of the entry, the uid is not changed. The MEDLINE uid is the simplest and most reliable way to identify the entry.

The entry-month is the month and year in which the entry became part of the public view of MEDLINE. It is not the same as the date the article was published. It is mostly useful for tracking what is new since a previous query of MEDLINE.

The article citation itself is contained in a standard Cit-art, imported from the bibliographic module, so will not be discussed further here. The entry often contains the abstract from the article.  The rest of the entry consists of various index terms, which will be discussed below.

The C implementation of a MedlineEntry is straightforward.

MeSH Index Terms

Medical Subject Heading (MeSH) terms are a tree of controlled vocabulary maintained by the Library Operations division of NLM. The tree is arranged with parent terms above more specialized terms within the same concept. An entry in MEDLINE is indexed by the most specific MeSH term(s) available. Since the MeSH vocabulary is a tree, one may then query on specific terms directly, or on general terms by including all the child terms in the query as well.

A MeSH term may be qualified by one or more sub-headings. For example, the MeSH term "insulin" may carry quite a different meaning if qualified by "clinical trials" versus being qualified by "genetics".

A MeSH term or a sub-heading may be flagged as indicating the "main point" of the article. Again the most specific form is used. If the main point of the article was about insulin and they also discuss genetics, then the insulin MeSH term will be flagged but the genetics sub-heading will not be. However, if the main point of the article was the genetics of insulin, then the sub-heading genetics under the MeSH term insulin will be flagged but the MeSH term itself will not be.

Substance Records

If an article has substantial discussion of recognizable chemical compounds, they are indexed in the substance records. The record may contain only the name of the compound, or it may contain the name and a Chemical Abstracts Service (CAS) registry number or an Enzyme Commission (EC) number as appropriate.

Database Cross Reference Records

If an article cites an identifier recognized to be from a known list of biomedical databases, the cross reference is given in this field, along with a key identifying which database it came from. A typical example would be a GenBank accession number cited in an article.

Funding Identifiers

If an id number from a grant or contract is cited in the article (usually acknowledging support) it will appear in this field.

In the C structure, ValNodes are used to make a linked list of the CharPtrs to the strings.

Gene Symbols

As an experiment, Library Operations at the NLM is putting in mnemonic symbols from articles, if they appear by form and usage to be gene symbols. Obviously such symbols vary and are not always properly used, so this field must be approached with caution. Nonetheless it can provide a route to a rich source of potentially relevant citations.

ASN.1 Specification: medline.asn

--$Revision: 1.2 $

--**********************************************************************

--

--  MEDLINE data definitions

--  James Ostell, 1990

--

--**********************************************************************

 

NCBI-Medline DEFINITIONS ::=

BEGIN

 

EXPORTS Medline-entry;

 

IMPORTS Cit-art FROM NCBI-Biblio

        Date FROM NCBI-General;

 

                                -- a MEDLINE entry

Medline-entry ::= SEQUENCE {

    uid INTEGER ,               -- MEDLINE UID

    em Date ,                   -- Entry Month

    cit Cit-art ,               -- article citation

    abstract VisibleString OPTIONAL ,

    mesh SET OF Medline-mesh OPTIONAL ,

    substance SET OF Medline-rn OPTIONAL ,

    xref SET OF Medline-si OPTIONAL ,

    idnum SET OF VisibleString OPTIONAL ,  -- ID Number (grants, contracts)

    gene SET OF VisibleString OPTIONAL }

 

Medline-mesh ::= SEQUENCE {

    mp BOOLEAN DEFAULT FALSE ,       -- TRUE if main point (*)

    term VisibleString ,                   -- the MeSH term

    qual SET OF Medline-qual OPTIONAL }    -- qualifiers

 

Medline-qual ::= SEQUENCE {

    mp BOOLEAN DEFAULT FALSE ,       -- TRUE if main point

    subh VisibleString }             -- the subheading

 

Medline-rn ::= SEQUENCE {       -- medline substance records

    type ENUMERATED {           -- type of record

        nameonly (0) ,

        cas (1) ,               -- CAS number

        ec (2) } ,              -- EC number

    cit VisibleString OPTIONAL ,  -- CAS or EC number if present

    name VisibleString }          -- name (always present)

 

Medline-si ::= SEQUENCE {       -- medline cross reference records

    type ENUMERATED {           -- type of xref

        ddbj (1) ,              -- DNA Data Bank of Japan

        carbbank (2) ,          -- Carbohydrate Structure Database

        embl (3) ,              -- EMBL Data Library

        hdb (4) ,               -- Hybridoma Data Bank

        genbank (5) ,           -- GenBank

        hgml (6) ,              -- Human Gene Map Library

        mim (7) ,               -- Mendelian Inheritance in Man

        msd (8) ,               -- Microbial Strains Database

        pdb (9) ,               -- Protein Data Bank (Brookhaven)

        pir (10) ,              -- Protein Identification Resource

        prfseqdb (11) ,         -- Protein Research Foundation (Japan)

        psd (12) ,              -- Protein Sequence Database (Japan)

        swissprot (13) } ,      -- SwissProt

    cit VisibleString OPTIONAL }    -- the citation/accession number

 

END

C Structures and Functions: objmedli.h

/*  objmedli.h

* ===========================================================================

*

*                            PUBLIC DOMAIN NOTICE                         

*               National Center for Biotechnology Information

*                                                                         

*  This software/database is a "United States Government Work" under the  

*  terms of the United States Copyright Act.  It was written as part of   

*  the author's official duties as a United States Government employee and

*  thus cannot be copyrighted.  This software/database is freely available

*  to the public for use. The National Library of Medicine and the U.S.   

*  Government have not placed any restriction on its use or reproduction. 

*                                                                          

*  Although all reasonable efforts have been taken to ensure the accuracy 

*  and reliability of the software and data, the NLM and the U.S.         

*  Government do not and cannot warrant the performance or results that   

*  may be obtained by using this software or data. The NLM and the U.S.   

*  Government disclaim all warranties, express or implied, including      

*  warranties of performance, merchantability or fitness for any particular

*  purpose.                                                                

*                                                                         

*  Please cite the author in any work or product based on this material.  

*

* ===========================================================================

*

* File Name:  objmedli.h

*

* Author:  James Ostell

*  

* Version Creation Date: 1/1/91

*

* $Revision: 1.2 $

*

* File Description:  Object manager interface for module NCBI-Medline

*

* Modifications: 

* --------------------------------------------------------------------------

* Date    Name        Description of modification

* -------  ----------  -----------------------------------------------------

*

*

* ==========================================================================

*/

 

#ifndef _NCBI_Medline_

#define _NCBI_Medline_

 

#ifndef _ASNTOOL_

#include <asn.h>

#endif

#ifndef _NCBI_General_

#include <objgen.h>

#endif

#ifndef _NCBI_Biblio_

#include <objbibli.h>

#endif

 

#ifdef __cplusplus

extern "C" {

#endif

 

/*****************************************************************************

*

*   loader

*

*****************************************************************************/

extern Boolean MedlineAsnLoad PROTO((void));

 

/*****************************************************************************

*

*    Medline-mesh

*

*****************************************************************************/

/* In-memory record for one Medline-mesh item; records are chained through `next`. */
typedef struct mesh {

    Boolean mp;                   /* main point */

    CharPtr term;                 /* the term, held as a string pointer */

    ValNodePtr qual;              /* ValNode pointer; presumably the qualifiers attached to this term -- TODO confirm against the Medline-mesh ASN.1 */

    struct mesh PNTR next;        /* next MedlineMesh record in the chain */

 } MedlineMesh, PNTR MedlineMeshPtr;

 

extern MedlineMeshPtr MedlineMeshNew PROTO((void));

extern MedlineMeshPtr MedlineMeshFree PROTO((MedlineMeshPtr mmp));

extern MedlineMeshPtr MedlineMeshAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean MedlineMeshAsnWrite PROTO((MedlineMeshPtr mmp, AsnIoPtr aip, AsnTypePtr atp));

 

/*****************************************************************************

*

*    Medline-rn

*

*****************************************************************************/

/* In-memory record for one Medline-rn item; records are chained through `next`. */
typedef struct rn {

    Uint1 type;               /* small integer code; NOTE(review): the meaning of each value is not shown here -- confirm against the Medline-rn ASN.1 */

    CharPtr cit,              /* string pointer; presumably the citation/number text for this record -- TODO confirm */

            name;             /* string pointer; presumably the name for this record -- TODO confirm */

    struct rn PNTR next;      /* next MedlineRn record in the chain */

 } MedlineRn, PNTR MedlineRnPtr;

 

extern MedlineRnPtr MedlineRnNew PROTO((void));

extern MedlineRnPtr MedlineRnFree PROTO((MedlineRnPtr mrp));

extern MedlineRnPtr MedlineRnAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean MedlineRnAsnWrite PROTO((MedlineRnPtr mrp, AsnIoPtr aip, AsnTypePtr atp));

 

/*****************************************************************************

*

*    Medline-si

*      ValNode used for structure

*

*****************************************************************************/

 

extern ValNodePtr MedlineSiAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean MedlineSiAsnWrite PROTO((ValNodePtr msp, AsnIoPtr aip, AsnTypePtr atp));

 

/*****************************************************************************

*

*   Medline-entry

*

*****************************************************************************/

/* In-memory record for one Medline-entry. */
typedef struct medline {

    Int4 uid;                     /* integer id of the entry; presumably the MEDLINE unique identifier -- TODO confirm */

    DatePtr em;                   /* a date; NOTE(review): what `em` abbreviates is not shown here -- confirm against the Medline-entry ASN.1 */

    CitArtPtr cit;                /* article citation (CitArt) for the entry */

    CharPtr abstract;             /* abstract text, held as a string pointer */

    MedlineMeshPtr mesh;          /* pointer to a MedlineMesh record (MedlineMesh records chain via their `next`) */

    MedlineRnPtr substance;       /* pointer to a MedlineRn record (MedlineRn records chain via their `next`) */

    ValNodePtr xref;              /* ValNode pointer; presumably Medline-si items, which this header represents as ValNodes -- TODO confirm */

    ValNodePtr idnum;             /* ValNode pointer; NOTE(review): node contents are not shown here -- confirm */

    ValNodePtr gene;              /* ValNode pointer; NOTE(review): node contents are not shown here -- confirm */

} MedlineEntry, PNTR MedlineEntryPtr;

 

extern MedlineEntryPtr MedlineEntryNew PROTO((void));

extern MedlineEntryPtr MedlineEntryFree PROTO((MedlineEntryPtr mep));

extern MedlineEntryPtr MedlineEntryAsnRead PROTO((AsnIoPtr aip, AsnTypePtr atp));

extern Boolean MedlineEntryAsnWrite PROTO((MedlineEntryPtr mep, AsnIoPtr aip, AsnTypePtr atp));

 

#ifdef __cplusplus

}

#endif

 



Biological Sequences


Introduction
Bioseq: the Biological Sequence
Seq-id: Identifying the Bioseq
Seq-annot: Annotating the Bioseq
Seq-descr: Describing the Bioseq and Placing It In Context
Seq-inst: Instantiating the Bioseq
Seq-hist: History of a Seq-inst
Seq-data: Encoding the Sequence Data Itself
Tables of Sequence Codes
Mapping Between Different Sequence Alphabets
Data and Tools for Sequence Alphabets
Pubdesc: Publication Describing a Bioseq
Numbering: Applying a Numbering System to a Bioseq
ASN.1 Specification: seq.asn
ASN.1 Specification: seqblock.asn
ASN.1 Specification: seqcode.asn
C Structures and Functions: objseq.h
C Structures and Functions: objpubd.h
C Structures and Functions: objblock.h
C Structures and Functions: objcode.h


 Introduction

A biological sequence is a single, continuous molecule of nucleic acid or protein. It can be thought of as a multiple inheritance class hierarchy. One hierarchy is that of the underlying molecule type: DNA, RNA, or protein. The other hierarchy is the way the underlying biological sequence is represented by the data structure. It could be a physical or genetic map, an actual sequence of amino acids or nucleic acids, or some more complicated data structure building a composite view from other entries. An overview of this data model has been presented previously, in the Data Model chapter. The overview will not be repeated here so if you have not read that chapter, do so now. This chapter will concern itself with the details of the specification and representation of biological sequence data.

Bioseq: the Biological Sequence

A Bioseq represents a single, continuous molecule of nucleic acid or protein. It can be anything from a band on a gel to a complete chromosome. It can be a genetic or physical map. All Bioseqs have more common properties than differences. All Bioseqs must have at least one identifier, a Seq-id (i.e. Bioseqs must be citable). Seq-ids are discussed in detail in the chapter Sequence Ids and Locations. All Bioseqs represent an integer coordinate system (even maps). All positions on Bioseqs are given by offsets from the first residue, and thus fall in the range from zero to (length - 1). All Bioseqs may have specific descriptive data elements (descriptors) and/or annotations such as feature tables, alignments, or graphs associated with them.

The differences in Bioseqs arise primarily from the way they are instantiated (represented). Different data elements are required to represent a map than are required to represent a sequence of residues.

The C structure for a Bioseq has pointers for a linked list of Seq-ids, a linked list of Seq-descr, and a linked list of Seq-annot, mapping quite directly from the ASN.1. However, since a Seq-inst is always required for a Bioseq, those fields have been incorporated into the Bioseq itself. There are SeqInstAsnRead() and SeqInstAsnWrite() as separate functions, but they take a pointer to a Bioseq.

A number of #defines are provided in objseq.h for the representation classes, molecule types, and types of sequence encoding used in the Bioseq C structure. Also the macros ISA_na() and ISA_aa() are provided to split Bioseqs into the two major molecule classes. A Bioseq.length equal to -1 means the length is unknown and will not appear in the ASN.1. When actual sequence data is present, Bioseq.seq_data holds the pointer to it. Bioseq.seq_data_type contains a value indicating the type of sequence encoding used (and thus the pointer type to cast Bioseq.seq_data to). Sequence encoding is discussed in more detail below.

Seq-id: Identifying the Bioseq

Every Bioseq MUST have at least one Seq-id, or sequence identifier. This means a Bioseq is always citable. You can refer to it by a label of some sort. This is a crucial property for different software tools or different scientists to be able to talk about the same thing. There is a wide range of Seq-ids and they are used in different ways. They are discussed in more detail in the Sequence Ids and Locations chapter.

Seq-annot: Annotating the Bioseq

A Seq-annot is a self-contained package of sequence annotations, or information that refers to specific locations on specific Bioseqs. Every Seq-annot can have an Object-id for local use by software, a Dbtag for globally identifying the source of the Seq-annot, and/or a name and description for display and use by a human. These describe the whole package of annotations and make it attributable to a source, independent of the source of the Bioseq.

A Seq-annot may contain a feature table, a set of sequence alignments, or a set of graphs of attributes along the sequence. These are described in detail in the Sequence Annotation chapter.

A Bioseq may have many Seq-annots. This means it is possible for one Bioseq to have feature tables from several different sources, or a feature table and set of alignments. A collection of sequences (see Sets Of Bioseqs) can have Seq-annots as well. Finally, a Seq-annot can stand alone, not directly attached to anything. This is because each element in the Seq-annot has specific references to locations on Bioseqs so the information is very explicitly associated with Bioseqs, not implicitly associated by attachment. This property makes possible the exchange of information about Bioseqs as naturally as the exchange of the Bioseqs themselves, be it among software tools or between scientists or as contributions to public databases.

Seq-descr: Describing the Bioseq and Placing It In Context

A Seq-descr is meant to describe a Bioseq (or set of Bioseqs; see Sets Of Bioseqs) and place it in a biological and/or bibliographic context. Seq-descrs apply to the whole Bioseq. Some Seq-descr classes appear also as features, when used to describe a specific part of a Bioseq. But anything appearing at the Seq-descr level applies to the whole thing.

The C implementation uses a linked list of ValNodes, where the ValNode.choice indicates what kind of Seq-descr this is, and ValNode.data contains either an integer or pointer depending on the type of descriptor. The file objseq.h lists the choices and data types, which are summarized in the following table. Under Value is the value of ValNode.choice. Type gives an indication of the data stored in ValNode.data. If "i", then an integer is stored in valnode->data.intvalue. Otherwise a pointer is stored in valnode->data.ptrvalue and the datatype of the pointer is given. The file objseq.h also has a series of #defines for Value below constructed by prefixing "Seq_descr_" to the Name below and replacing any hyphens (-) in the ASN.1 name with underscores (_) to make it legal C (e.g. #define Seq_descr_mol_type 1).

Seq-descr

Value

Name

Type

Explanation

1

mol-type

i

role of molecule in life

2

modif

ValNodePtr

modifying keywords of mol-type

3

method

i

protein sequencing method used

4

name

CharPtr

a commonly used name (e.g. "SV40")

5

title

CharPtr

a descriptive title or definition

6

org

OrgRefPtr

(single) organism from which mol comes

7

comment

CharPtr

descriptive comment (may have many)

8

num

NumberingPtr

a numbering system for whole Bioseq

9

maploc

DbtagPtr

a map location from a mapping database

10

pir

PirBlockPtr

PIR specific data

11

genbank

GBBlockPtr

GenBank flatfile specific data

12

pub

PubdescPtr

Publication citation and descriptive info from pub

13

region

CharPtr

name of genome region (e.g. B-globin cluster)

14

user

UserObjectPtr

user defined data object for any purpose

15

sp

SPBlockPtr

SWISSPROT specific data

16

neighbors

LinkSetPtr

ids of pre-calculated similar sequences

17

embl

EMBLBlockPtr

EMBL specific data

18

create-date

DatePtr

date entry was created by source database

19

update-date

DatePtr

date entry last updated by source database

20

prf

PrfBlockPtr

PRF specific data

21

pdb

PdbBlockPtr

PDB specific data

22

het

CharPtr

heterogen: non-Bioseq atom/molecule

mol-type: The Molecule Type

A Seq-descr.mol-type is of type GIBB-mol. It is derived from the molecule information used in the GenInfo BackBone database. It indicates the biological role of the Bioseq in life. It can be genomic (including organelle genomes). It can be a transcription product such as pre-mRNA, mRNA, rRNA, tRNA, snRNA (small nuclear RNA), or scRNA (small cytoplasmic RNA). All amino acid sequences are peptides. No distinction is made at this level about the level of processing of the peptide (but see Prot-ref in the Sequence Annotations chapter). The type other-genetic is provided for "other genetic material" such as B chromosomes or F factors that are not normal genomic material but are also not transcription products. The type genomic-mRNA is provided to describe sequences presented in figures in papers in which the author has combined genomic flanking sequence with cDNA sequence. Since such a figure often does not accurately reflect either the sequence of the mRNA or the sequence of the genome, this practice should be discouraged.

Since GIBB-mol is an ENUMERATED type, the ValNode for the Seq-descr simply places the enumerated value in ValNode.data.intvalue.

modif: Modifying Our Assumptions About a Bioseq

A GIBB-mod began as a GenInfo BackBone component and was found to be of general utility. A GIBB-mod is meant to modify the assumptions one might make about a Bioseq. If a GIBB-mod is not present, it does not mean it does not apply, only that it is part of a reasonable assumption already. For example, a Bioseq with GIBB-mol = genomic would be assumed to be DNA, to be chromosomal, and to be partial (complete genome sequences are still rare). If GIBB-mod = mitochondrial and GIBB-mod = complete are both present in Seq-descr, then we know this is a complete mitochondrial genome. Even though GIBB-mod = DNA  is not present we can still assume it is DNA.

The modifier concept permits a lot of flexibility. So a peptide with GIBB-mod = mitochondrial is a mitochondrial protein. There is no implication that it is from a mitochondrial gene, only that it functions in the mitochondrion. The assumption is that peptide sequences are complete, so GIBB-mod = complete is not necessary for most proteins, but GIBB-mod = partial is important information for some. A list of brief explanations of GIBB-mod values follows:

GIBB-mod

Value

Name

Explanation

0

dna

molecule is DNA in life

1

rna

molecule is RNA in life

2

extrachrom

molecule is extrachromosomal

3

plasmid

molecule is or is from a plasmid

4

mitochondrial

molecule is from mitochondrion

5

chloroplast

molecule is from chloroplast

6

kinetoplast

molecule is from kinetoplast

7

cyanelle

molecule is from cyanelle

8

synthetic

molecule was synthesized artificially

9

recombinant

molecule was formed by recombination

10

partial

not a complete sequence for molecule

11

complete

sequence covers complete molecule

12

mutagen

molecule subjected to mutagenesis

13

natmut

molecule is a naturally occurring mutant

14

transposon

molecule is a transposon

15

insertion-seq

molecule is an insertion sequence

16

no-left

partial molecule is missing left end

5' end for nucleic acid, NH2 end for peptide

17

no-right

partial molecule is missing right end

3' end for nucleic acid, COOH end for peptide

18

macronuclear

molecule is from macronucleus

19

proviral

molecule is an integrated provirus

20

est

molecule is an expressed sequence tag

Seq-descr.modif is defined as a SET OF GIBB-mod, so it must be implemented as a chain, not as a single value. The ValNode representing a Seq-descr.modif then has ValNode.choice = Seq_descr_modif and a ValNode.data.ptrvalue is the head of a chain of ValNodes. Each member of that chain has a ValNode.data.intvalue set to represent a single GIBB-mod according to the table above.

method: Protein Sequencing Method

The method Seq-descr gives the method used to obtain a protein sequence. The values for a GIBB-method are also stored in the C structure as integer values mapping directly from the ASN.1 ENUMERATED type. They are:

GIBB-method

Value

Name

Explanation

1

concept-trans

conceptual translation

2

seq-pept

peptide itself was sequenced

3

both

conceptual translation with partial peptide sequencing

4

seq-pept-overlap

peptides sequenced, fragments ordered by overlap

5

seq-pept-homol

peptides sequenced, fragments ordered by homology

6

concept-trans-a

conceptual translation, provided by author of sequence

name: A Descriptive Name

A sequence name is very different from a sequence identifier. A Seq-id uniquely identifies a specific Bioseq. A Seq-id may be no more than an integer and will not necessarily convey any biological or descriptive information in itself. A name is not guaranteed to uniquely identify a single Bioseq, but if used with caution, can be a very useful tool to identify the best current entry for a biological entity. For example, we may wish to associate the name "SV40" with a single Bioseq for the complete genome of SV40. Let us suppose this Bioseq has the Seq-id 10. Then it is discovered that there were errors in the original Bioseq designated 10, and it is replaced by a new Bioseq from a curator with Seq-id 15. The name "SV40" can be moved to Seq-id 15 now. If a biologist wishes to see the "best" or "most typical" sequence of the SV40 genome, she would retrieve on the name "SV40". At an earlier point in time she would get Bioseq 10. At a later point she would get Bioseq 15. Note that her query is always answered in the context of best current data. On the other hand, if she had done a sequence analysis on Bioseq 10 and wanted to compare results, she would cite Seq-id 10, not the name "SV40", since her results apply to the specific Bioseq, 10, not necessarily to the "best" or "most typical" entry for the virus at the moment.

title: A Descriptive Title

A title is a brief, generally one line, description of an entry. It is extremely useful when presenting lists of Bioseqs returned from a query or search. This is the same as the familiar GenBank flatfile DEFINITION line.

 Because of the utility of such terse summaries, NCBI has been experimenting with algorithmically generated titles which try to pack as much information as possible into a single line in a regular and readable format. You will see titles of this form appearing on entries produced by the NCBI journal scanning component of GenBank.

DEFINITION  atp6=F0-ATPase subunit 6 {RNA edited} [Brassica napus=rapeseed,

            mRNA Mitochondrial, 905 nt]

 

DEFINITION  mprA=metalloprotease, mprR=regulatory protein [Streptomyces

            coelicolor, Muller DSM3030, Genomic, 3 genes, 2040 nt]

 

DEFINITION  pelBC gene cluster: pelB=pectate lyase isozyme B, pelC=pectate

            lyase isozyme C [Erwinia chrysanthemi, 3937, Genomic, 2481 nt]

 

DEFINITION  glycoprotein J...glycoprotein I [simian herpes B virus SHBV,

            prototypic B virus, Genomic, 3 genes, 2652 nt]

 

DEFINITION  glycoprotein B, gB [human herpesvirus-6 HHV6, GS, Peptide, 830

               aa]

 

DEFINITION  {pseudogene} RESA-2=ring-infected erythrocyte surface antigen 2

            [Plasmodium falciparum, FCR3, Genomic, 3195 nt]

 

DEFINITION  microtubule-binding protein tau {exons 4A, 6, 8 and 13/14} [human,

            Genomic, 954 nt, segment 1 of 4]

 

DEFINITION  CAD protein carbamylphosphate synthetase domain {5' end} [Syrian

            hamsters, cell line 165-28, mRNA Partial, 553 nt]

 

DEFINITION  HLA-DPB1 (SSK1)=MHC class II antigen [human, Genomic, 288 nt]

Gene and protein names come first. If both gene name and protein name are known, they are linked with "=". If more than two genes are on a Bioseq, then the first and last gene are given, separated by "...". A region name, if available, will precede the gene names. Extra comments will appear in {}. Organism, strain names, and molecule type and modifier appear in [] at the end. Note that the whole definition is constructed from structured information in the ASN.1 data structure by software. It is not composed by hand, but is instead a brief, machine generated summary of the entry based on data within the entry. We therefore discourage attempts to machine parse this line. It may change, but the underlying structured data will not. Software should always be designed to process the structured data.

org: What Organism Did this Come From?

If the whole Bioseq comes from a single organism (the usual case), that organism is given here. See the Feature Table chapter for a detailed description of the Org-ref (organism reference) data structure.

comment: Commentary Text

A comment that applies to the whole Bioseq may go here. A comment may contain many sentences or paragraphs. A Bioseq may have many comments.

num: Applying a Numbering System to a Bioseq

One may apply a custom numbering system over the full length of the Bioseq with this Seq-descr. See the section on Numbering later in this chapter for a detailed description of the possible forms this can take. To report the numbering system used in a particular publication, the Pubdesc Seq-descr has its own Numbering slot.

maploc: Map Location

The map location given here is a Dbtag, to be able to cite a map location given by a map database to this Bioseq (e.g. "GDB", "4q21"). It is not necessarily the map location published by the author of the Bioseq. A map location published by the author would be part of a Pubdesc Seq-descr.

pir: PIR Specific Data

sp: SWISSPROT Data

embl: EMBL Data

prf: PRF Data

pdb: PDB Data

NCBI produces ASN.1 encoded entries from data provided by many different sources. Almost all of the data items from these widely differing sources are mapped into the common ASN.1 specifications described in this document. However, in all cases a small number of elements are unique to a particular data source, or cannot be unambiguously mapped into the common ASN.1 specification. Rather than lose such elements, they are carried in small data structures unique to each data source. These are specified in seqblock.asn and objblock.h.

genbank: GenBank Flatfile Specific Data

A number of data items unique to the GenBank flatfile format do not map readily to the common ASN.1 specification. These fields are partially populated by NCBI for Bioseqs derived from other sources than GenBank to permit the production of valid GenBank flatfile entries from those Bioseqs. Other fields are populated to preserve information coming from older GenBank entries.

pub: Description of a Publication

This Seq-descr is used both to cite a particular bibliographic source and to carry additional information about the Bioseq as it appeared in that publication, such as the numbering system to use, the figure it appeared in, a map location given by the author in that paper, and so on. See the section on the Pubdesc later in this chapter for a more detailed description of this data type.

region: Name of a Genomic Region

A region of genome often has a name which is a commonly understood description for the Bioseq, such as "B-globin cluster".

user: A User-defined Structured Object

This is a place holder for software or databases to add their own structured datatypes to Bioseqs without corrupting the common specification or disabling the automatic ASN.1 syntax checking. A User-object can also be used as a feature. See the chapter on General User Objects for a detailed explanation of User-objects.

neighbors: Bioseqs Related by Sequence Similarity

NCBI computes a list of "neighbors", or closely related Bioseqs based on sequence similarity for use in the Entrez service. This descriptor is so that such context setting information could be included in a Bioseq itself, if desired.

create-date:

This is the date a Bioseq was created for the first time. It is normally supplied by the source database. It may not be present when not normally distributed by the source database.

update-date:

This is the date of the last update to a Bioseq by the source database. For several source databases this is the only date provided with an entry. The nature of the last update done is generally not available in computer readable (or any) form.

het: Heterogen

A "heterogen" is a non-biopolymer atom or molecule associated with Bioseqs from PDB. When a heterogen appears at the Seq-descr level, it means it was resolved in the crystal structure but is not associated with specific residues of the Bioseq. Heterogens which are associated with specific residues of the Bioseq are attached as features.

Seq-inst: Instantiating the Bioseq

Seq-inst.mol gives the physical type of the Bioseq in the living organism. If it is not certain if the Bioseq is DNA (dna) or RNA (rna), then (na) can be used to indicate just "nucleic acid". A protein is always (aa) or "amino acid". The values "not-set" or "other" are provided for internal use by editing and authoring tools, but should not be found on a finished Bioseq being sent to an analytical tool or database.

The representation class to which the Bioseq belongs is encoded in Seq-inst.repr. The values "not-set" or "other" are provided for internal use by editing and authoring tools, but should not be found on a finished Bioseq being sent to an analytical tool or database. The Data Model chapter discusses the representation class hierarchy in general. Specific details follow below.

Seq-inst: Virtual Bioseq

A "virtual" Bioseq is one in which we know the type of molecule, and possibly its length, topology, and/or strandedness, but for which we do not have sequence data. It is not unusual to have some uncertainty about the length of a virtual Bioseq, so Seq-inst.fuzz may be used. The fields Seq-inst.seq-data and Seq-inst.ext are not appropriate for a virtual Bioseq.

Seq-inst: Raw Bioseq

A "raw" Bioseq does have sequence data, so Seq-inst.length must be set and there should be no Seq-inst.fuzz associated with it. Seq-inst.seq-data must be filled in with the sequence itself and a Seq-data encoding must be selected which is appropriate to Seq-inst.mol. The topology and strandedness may or may not be available. Seq-inst.ext is not appropriate.

Seq-inst: Segmented Bioseq

A segmented ("seg") Bioseq has all the properties of a virtual Bioseq, except that Seq-inst.ext of type Seq-ext.seg must be used to indicate the pieces of other Bioseqs to assemble to make the segmented Bioseq. A Seq-ext.seg is defined as a SEQUENCE OF Seq-loc, or a series of locations on other Bioseqs, taken in order.

For example, a segmented Bioseq (called "X") has a SEQUENCE OF Seq-loc which are an interval from position 11 to 20 on Bioseq "A" followed by an interval from position 6 to 15 on Bioseq "B". So "X" is a Bioseq with no internal gaps which is 20 residues long (no Seq-inst.fuzz). The first residue of "X" is the residue found at position 11 in "A". To obtain this residue, software must retrieve Bioseq "A" and examine the residue at "A" position 11. The segmented Bioseq contains no sequence data itself, only pointers to where to get the sequence data and what pieces to assemble in what order.

The type of segmented Bioseq described above might be used to represent the putative mRNA by simply pointing to the exons on two pieces of genomic sequence. Suppose however, that we had only sequenced around the exons on the genomic sequence, but wanted to represent the putative complete genomic sequence.  Let us assume that Bioseq "A" is the genomic sequence of the first exon and some small amount of flanking DNA, and that Bioseq "B" is the genomic sequence around the second exon. Further, we may know from mapping that the exons are separated by about two kilobases of DNA. We can represent the genomic region by creating a segmented sequence in which the first location is all of Bioseq "A". The second location will be all of a virtual Bioseq (call it "C") whose length is two thousand and which has a Seq-inst.fuzz representing whatever uncertainty we may have about the exact length of the intervening genomic sequence. The third location will be all of Bioseq "B". If "A" is 100 base pairs long and "B" is 200 base pairs, then the segmented entry is 2300 base pairs long ("A"+"C"+"B") and has the same Seq-inst.fuzz as "C" to express the uncertainty of the overall length.

A variation of the case above is when one has no idea at all what the length of the intervening genomic region is. A segmented Bioseq can also represent this case. The Seq-inst.ext location chain would be first all of "A", then a Seq-loc of type "null", then all of "B". The "null" indicates that there is no available information here. The length of the segmented Bioseq is just the sum of the length of "A" and the length of "B", and Seq-inst.fuzz is set to indicate the real length is greater-than the length given. The "null" location does not add to the overall length of the segmented Bioseq and is ignored in determining the integer value of a location on the segmented Bioseq itself. If "A" is 100 base pairs long and "B" is 50 base pairs long, then position 0 on the segmented Bioseq is equivalent to the first residue of "A" and position 100 on the segmented Bioseq is equivalent to the first residue of "B", despite the intervening "null" location indicating the gap of unknown length. Utility functions such as the SeqPort (described in the Sequence Utilities chapter) can be configured to signal when crossing such boundaries, or to ignore them.

The Bioseqs referenced by a segmented Bioseq should always be from the same Seq-inst.mol class as the segmented Bioseq, but may well come from a mixture of Seq-inst.repr classes (as for example the mixture of virtual and raw Bioseq references used to describe sequenced and unsequenced genomic regions above). Other reasonable mixtures might be raw and map (see below) Bioseqs to describe a region which is fully mapped and partially sequenced, or even a mixture of virtual, raw, and map Bioseqs for a partially mapped and partially sequenced region. The "character" of any region of a segmented Bioseq is always taken from the underlying Bioseq to which it points in that region. However, a segmented Bioseq can have its own annotations. Things like feature tables are not automatically propagated to the segmented Bioseq.

Seq-inst: Reference Bioseq

A reference Bioseq is effectively a segmented Bioseq with only one pointer location. It behaves exactly like a segmented Bioseq in taking its data and "character" from the Bioseq to which it points. Its purpose is not to construct a new Bioseq from others like a segmented Bioseq, but to refer to an existing Bioseq. It could be used to provide a convenient handle to a frequently used region of a larger Bioseq. Or it could be used to develop a customized, personally annotated view of a Bioseq in a public database without losing the "live" link to the public sequence.

In the first example, software would want to be able to use the Seq-loc to gather up annotations and descriptors for the region and display them to the user with corrections to align them appropriately to the sub region. In this form, a scientist may refer to the "lac region" by name, and analyze or annotate it as if it were a separate Bioseq, but each retrieve starts with a fresh copy of the underlying Bioseq and annotations, so corrections or additions made to the underlying Bioseq in the public database will be immediately visible to the scientist, without either having to always look at the whole Bioseq or losing any additional annotations the scientist may have made on the region themselves.

In the second example, software would not propagate annotations or descriptors from the underlying Bioseq by default (because presumably the scientist prefers his own view to the public one) but the connection to the underlying Bioseq is not lost. Thus the public annotations are available on demand and any new annotations added by the scientist share the public coordinate system and can be compared with those done by others.

Seq-inst: Constructed Bioseq

A constructed (const) Bioseq inherits all the attributes of a raw Bioseq. It is used to represent a Bioseq which has been constructed by assembling other Bioseqs. In this case the component Bioseqs normally overlap each other and there may be considerable redundancy of component Bioseqs. A constructed Bioseq is often also called a "contig" or a "merge".

Most raw Bioseqs in the public databases were constructed by merging overlapping gel or sequencer readings of a few hundred base pairs each. While the const Bioseq data structure can easily accommodate this information, the const Bioseq data type was not really intended for this purpose. It was intended to represent higher level merges of public sequence data and private data, such as when a number of sequence entries from different authors are found to overlap or be contained in each other. In this case a view of the larger sequence region can be constructed by merging the components. The relationship of the merge to the component Bioseqs is preserved in the constructed Bioseq, but it is clear that the constructed Bioseq is a "better" or "more complete" view of the overall region, and could replace the component Bioseqs in some views of the sequence database. In this way an author can submit a data structure to the database which in this author's opinion supersedes his own or other scientists' database entries, without the database actually dropping the other authors' entries (who may not necessarily agree with the author submitting the constructed Bioseq).

The constructed Bioseq is like a raw, rather than a segmented, Bioseq because Seq-inst.seq-data must be present. The sequence itself is part of the constructed Bioseq.  This is because the component Bioseqs may overlap in a number of ways, and expert knowledge or voting rules may have been applied to determine the "correct" or "best" residue from the overlapping regions. The Seq-inst.seq-data contains the sequence which is the final result of such a process.

Seq-inst.ext is not used for the constructed Bioseq. The relationship of the merged sequence to its component Bioseqs is stored in Seq-inst.hist, the history of the Bioseq (described in more detail below). Seq-hist.assembly contains alignments of the constructed Bioseq with its component Bioseqs. Any Bioseq can have a Seq-hist.assembly. A raw Bioseq may use this to show its relationship to its gel readings. The constructed Bioseq is special in that its Seq-hist.assembly shows how a high level view was constructed from other pieces. The sequence in a constructed Bioseq is only posited to exist. However, since it is constructed from data by possibly many different laboratories, it may never have been sequenced in its entirety from a single biological source.

Seq-inst: Typical or Consensus Bioseq

A consensus (consen) Bioseq is used to represent a pattern typical of a sequence region or family of sequences. There is no assertion that even one sequence exists that is exactly like this one, or even that the Bioseq is a best guess at what a real sequence region looks like. Instead it summarizes attributes of an aligned collection of real sequences. It could be a "typical" ferredoxin made by aligning ferredoxin sequences from many organisms and producing a protein sequence which is by some measure "central" to the group. By using the NCBIpaa encoding for the protein, which permits a probability to be assigned to each position that any of the standard amino acids occurs there, one can create a "weight matrix" or "profile" to define the sequence.

While a consensus Bioseq can represent a frequency profile (including the probability that any amino acid can occur at a position, a type of gap penalty), it cannot represent a regular expression per se. That is because all Bioseqs represent fixed integer coordinate systems. This property is essential for attaching feature tables or expressing alignments. There is no clear way to attach a fixed coordinate system to a regular expression, while one can approximate allowing weighted gaps in specific regions with a frequency profile. Since the consensus Bioseq is like any other, information can be attached to it through a feature table and alignments of the consensus pattern to other Bioseqs can be represented like any other alignment (although it may be computed a special way). Through the alignment, annotated features on the pattern can be related to matched regions of the aligned sequence in a straightforward way.

Seq-hist.assembly can be used in a consensus Bioseq to record the sequence regions used to construct the pattern and their relationships with it. While Seq-hist.assembly for a constructed Bioseq indicates the relationship with Bioseqs which are meant to be superseded by the constructed Bioseq, the consensus Bioseq does not in any way replace the Bioseqs in its Seq-hist.assembly. Rather it is a summary of common features among them, not a "better" or "more complete" version of them.

Seq-inst: Map Bioseqs

A map Bioseq inherits all the properties of a virtual Bioseq. For a consensus genetic map of E.coli, we can posit that the chromosome is DNA, circular, double-stranded, and about 5 million base pairs long. Given this coordinate system, we estimate the positions of genes on it based on genetic evidence. That is, we build a feature table with Gene-ref features on it (explained in more detail in the Feature Table chapter). Thus, a map Bioseq is a virtual Bioseq with a Seq-inst.ext which is a feature table. In this case the feature table is an essential part of instantiating the Bioseq, not simply an annotation on the Bioseq. This is not to say a map Bioseq cannot have a feature table in the usual sense as well. It can. It can also be used in alignments, displays, or by any software that can process or store Bioseqs. This is the great strength of this approach. A genetic or physical map is just another Bioseq and can be stored or analyzed right along with other more typical Bioseqs.

It is understood that within a particular physical or genetic mapping research project more data will have to be present than the map Bioseq can represent. But the same is true for a big sequencing project. The Bioseq is an object for reporting the result of such projects to others in a way that preserves most or all the information of use to workers outside the particular research group. It also preserves enough information to be useful to software tools within the project, such as display tools or analysis tools which were written by others.

A number of attributes of Bioseqs can make such a generic representation more "natural" to a particular research community.  For the E.coli map example, above, no E.coli geneticist thinks of the positions of genes in base pairs (yet). So a Num-ref annotation (see Seq-descr, below) can be attached to the Bioseq, which provides information to convert the internal integer coordinate system of the map Bioseq to "minutes", the floating point numbers from 0.0 to 100.0 that E.coli gene positions are traditionally given in. Seq-loc objects which the Gene-ref features use to indicate their position can represent uncertainty, and thus give some idea of the accuracy of the mapping in a simple way. This representation cannot store order information directly (e.g. B and C are after A and before D, but we don't know the absolute distance and we don't know the relative order of B and C), which would need to be stored in a genetic mapping research database. However, a reasonable enough presentation can be made of this situation using locations and uncertainties to be very useful for a wide variety of purposes. As more sequence and physical map information becomes available, such uncertainties in gene position, at least for the "typical" chromosome, will gradually be resolved and will then map very well to such a generic model.

A physical map Bioseq has similar strengths and weaknesses as the genetic map Bioseq. It can represent an ordered map (such as an ordered restriction map) very well and easily. For some contig building approaches, ordering information is essential to the process of building the physical map and would have to be stored and processed separately by the map building research group. However, the map Bioseq serves very well as a vehicle for periodic reports of the group's best view of the physical map for consumption by the scientific public. The map Bioseq data structure maps quite well to the figures such groups publish to summarize their work. The map Bioseq is an electronic summary that can be integrated with other data and software tools.

Seq-hist: History of a Seq-inst

Seq-hist is literally the history of the Seq-inst part of a Bioseq. It does not track changes in annotation at all. However, since the coordinate system provided by the Seq-inst is the critical element for tying annotations and alignments done at various times by various people into a single consistent database, this is the most important element to track.

While Seq-hist can use any valid Seq-id, in practice NCBI will use the best available Seq-id in the Seq-hist. For this purpose, the Seq-id most tightly linked to the exact sequence itself is best. See the Seq-id discussion.

Seq-hist.assembly has been mentioned above. It is a SET OF Seq-align which show the relationship of this Bioseq to any older components that might be merged into it. The Bioseqs included in the assembly are those from which this Bioseq was made or is meant to supersede. The Bioseqs in the assembly need not all be from the author, but could come from anywhere. Assembly just sets the Bioseq in context.

Seq-hist.replaces makes an editorial statement using a Seq-hist-rec. As of a certain date, this Bioseq should replace the following Bioseqs. Databases at NCBI interpret this in a very specific way. Seq-ids in Seq-hist.replaces, which are owned by the owner of the Bioseq, are taken from the public view of the database. The author has told us to replace them with this one. If the author does not own some of them, it is taken as advice that the older entries may be obsolete, but they are not removed from the public view.

Seq-hist.replaced-by is a forward pointer. It means this Bioseq was replaced by the following Seq-id(s) on a certain date. In the case described above, that an author tells NCBI that a new Bioseq replaces some of his old ones, not only is the backward pointer (Seq-hist.replaces) provided by the author in the database, but NCBI will update the Seq-hist.replaced-by forward pointer when the old Bioseq is removed from public view. Since such old entries are still available for specific retrieval by the public, if a scientist does have annotation pointing to the old entry, the new entry can be explicitly located. Conversely, the older versions of a Bioseq can easily be located as well. Note that Seq-hist.replaced-by points only one generation forward and Seq-hist.replaces points only one generation back. This makes Bioseqs with a Seq-hist a doubly linked list over its revision history. This is very different from GenBank/EMBL/DDBJ secondary accession numbers, which only indicate "some relationship" between entries. When that relationship happens to be the replacement relationship, they still carry all accession numbers in the secondary accessions, not just the last ones, so reconstructing the entry history is impossible, even in a very general way.

Another fate which may await a Bioseq is that it is completely withdrawn. This is relatively rare but does happen. Seq-hist.deleted can either be set to just TRUE, or the date of the deletion event can be entered (preferred). In the SeqHist C structure, slots for both the deleted boolean and deleted date are present. If the deleted date is present, the ASN.1 will have the Date CHOICE for Seq-hist.deleted, else if the deleted boolean is TRUE the ASN.1 will have the BOOLEAN form.

Seq-data: Encoding the Sequence Data Itself

In the case of a raw or constructed Bioseq, the sequence data itself is stored in Seq-inst.seq-data, which is the data type Seq-data. Seq-data is a CHOICE of different ways of encoding the data, allowing selection of the optimal type for the case in hand. Both nucleic acid and amino acid encoding are given as CHOICEs of Seq-data rather than further subclassing first. But it is still not reasonable to encode a Bioseq of Seq-inst.mol of "aa" using a nucleic acid Seq-data type.

In the C structures all types of Seq-data are stored in ByteStores in Bioseq.seq_data. The encoding is given by the value of Bioseq.seq_data_type. The file objseq.h contains a series of #defines for the values of Bioseq.seq_data_type. These #defines map exactly to the ASN.1 Seq-code-type described below.

The ASN.1 module seqcode.asn and C header objcode.h define tables for recording the allowed values for the various sequence encodings and the ways to display or map between codes. This permits useful information about the allowed encodings to be stored as ASN.1 data and read into a program at runtime. NCBI uses the text file seqcode.prt and the binary version of that, seqcode.val, with its software tools. Some of the data from this file is presented in tables in the following discussion of the different sequence encodings. The "value" is the internal numerical value of a residue in the C code. The "symbol" is a one letter or multi-letter symbol to be used in display to a human. The "name" is a descriptive name for the residue. Other data in seqcode.prt will be discussed in the section on seqcode.asn itself.

IUPACaa: The IUPAC-IUB Encoding of Amino Acids

A set of one letter abbreviations for amino acids was suggested by the IUPAC-IUB Commission on Biochemical Nomenclature, published in J. Biol. Chem. (1968) 243: 3557-3559. It is very widely used in both printed and electronic forms of protein sequence, and many computer programs have been written to analyze data in this form internally (that is the actual ASCII value of the one letter code is used internally). To support such approaches, the IUPACaa encoding represents each amino acid internally as the ASCII value of its external one letter symbol. Note that this symbol is UPPER CASE. One may choose to display the value as lower case to a user for readability, but the data itself must be the UPPER CASE value.

In the NCBI C code implementation, the values are stored one value per byte.

IUPACaa

Value

Symbol

Name

65

A

Alanine

66

B

Asp or Asn

67

C

Cysteine

68

D

Aspartic Acid

69

E

Glutamic Acid

70

F

Phenylalanine

71

G

Glycine

72

H

Histidine

73

I

Isoleucine

74

J

Leu or Ile

75

K

Lysine

76

L

Leucine

77

M

Methionine

78

N

Asparagine

79

O

Pyrrolysine

80

P

Proline

81

Q

Glutamine

82

R

Arginine

83

S

Serine

84

T

Threonine

86

V

Valine

87

W

Tryptophan

88

X

Undetermined or atypical

89

Y

Tyrosine

90

Z

Glu or Gln

NCBIeaa: Extended IUPAC Encoding of Amino Acids

The official IUPAC amino acid code has some limitations. One is the lack of symbols for termination, gap, or selenocysteine. Such extensions to the IUPAC codes are also commonly used by sequence analysis software. NCBI has created such a code which is simply the IUPACaa code above extended with the additional symbols.

In the NCBI C code implementation, the values are stored one value per byte.

NCBIeaa

Value

Symbol

Name

42

*

Termination

45

-

Gap

65

A

Alanine

66

B

Asp or Asn

67

C

Cysteine

68

D

Aspartic Acid

69

E

Glutamic Acid

70

F

Phenylalanine

71

G

Glycine

72

H

Histidine

73

I

Isoleucine

74

J

Leu or Ile

75

K

Lysine

76

L

Leucine

77

M

Methionine

78

N

Asparagine

79

O

Pyrrolysine

80

P

Proline

81

Q

Glutamine

82

R

Arginine

83

S

Serine

84

T

Threonine

85

U

Selenocysteine

86

V

Valine

87

W

Tryptophan

88

X

Undetermined or atypical

89

Y

Tyrosine

90

Z

Glu or Gln

NCBIstdaa: A Simple Sequential Code for Amino Acids

It is often very useful to separate the external symbol for a residue from its internal representation as a data value. For amino acids NCBI has devised a simple continuous set of values that encompasses the set of "standard" amino acids also represented by the NCBIeaa code above. A continuous set of values means that compact arrays can be used in computer software to look up attributes for residues simply and easily by using the value as an index into the array. The only significance of any particular mapping of a value to an amino acid is that zero is used for gap and the official IUPAC amino acids come first in the list. In general, we recommend the use of this encoding for standard amino acid sequences.

In the NCBI C code implementation, the values are stored one value per byte.

NCBIstdaa

Value

Symbol

Name

0

-

Gap          

1

A

Alanine      

2

B

Asp or Asn   

3

C

Cysteine     

4

D

Aspartic Acid

5

E

Glutamic Acid

6

F

Phenylalanine

7

G

Glycine      

8

H

Histidine    

9

I

Isoleucine   

10

K

Lysine       

11

L

Leucine      

12

M

Methionine   

13

N

Asparagine   

14

P

Proline      

15

Q

Glutamine    

16

R

Arginine     

17

S

Serine       

18

T

Threonine    

19

V

Valine       

20

W

Tryptophan   

21

X

Undetermined or atypical

22

Y

Tyrosine     

23

Z

Glu or Gln   

24

U

Selenocysteine

25

*

Termination

26

O

Pyrrolysine

27

J

Leu or Ile

NCBI8aa: An Encoding for Modified Amino Acids

Post-translational modifications can introduce a number of non-standard or modified amino acids into biological molecules. The NCBI8aa code will be used to represent up to 250 possible amino acids by using the remaining coding space in the NCBIstdaa code. That is, for the first 28 values, NCBI8aa will be identical to NCBIstdaa. The remaining 222 values will be used for the most commonly encountered modified amino acids. Only the first 250 values will be used to signify amino acids, leaving values in the range of 250-255 to be used for software control codes. Obviously there are a very large number of possible modified amino acids, especially if one takes protein engineering into account. However, the intent here is to only represent commonly found biological forms. This encoding is not yet available since decisions about what amino acids to include have not all been made yet.

IUPAC3aa: A 3 Letter Display Code for Amino Acids

The IUPAC3aa code uses exactly the same values as NCBIstdaa. The only difference is that the symbol is the three letter code instead of the one letter code. This code is purely for display. As such it does not appear as a valid CHOICE in Seq-data for encoding actual sequence data. However, it does appear in the seqcode.asn specification and is stored in seqcode.val. The symbols follow the IUPAC-IUB recommendations for three letter codes where possible.

IUPAC3aa

Value

Symbol

Name

0

---

Gap

1

Ala

Alanine

2

Asx

Asp or Asn

3

Cys

Cysteine

4

Asp

Aspartic Acid

5

Glu

Glutamic Acid

6

Phe

Phenylalanine

7

Gly

Glycine

8

His

Histidine

9

Ile

Isoleucine

10

Lys

Lysine

11

Leu

Leucine

12

Met

Methionine

13

Asn

Asparagine

14

Pro

Proline

15

Gln

Glutamine

16

Arg

Arginine

17

Ser

Serine

18

Thr

Threonine

19

Val

Valine

20

Trp

Tryptophan

21

Xxx

Undetermined or atypical

22

Tyr

Tyrosine

23

Glx

Glu or Gln

24

Sec

Selenocysteine

25

Ter

Termination

26

Pyl

Pyrrolysine

27

Xle

Leu or Ile

NCBIpaa: A Profile Style Encoding for Amino Acids

The NCBIpaa encoding is designed to accommodate a frequency profile describing a protein motif or family in a form which is consistent with the sequences in a Bioseq. Each position in the sequence is defined by 30 values. Each of the 30 values represents the probability that a particular amino acid (or gap, termination, etc.) will occur at that position. One can consider each set of 30 values an array. The amino acid for each cell of the 30 value array corresponds to the NCBIstdaa index scheme. This means that currently only the first 28 array elements will ever have a meaningful value. The remaining 2 cells are available for possible future additions to NCBIstdaa. Each cell represents the probability that the amino acid defined by the NCBIstdaa index to that cell will appear at that position in the motif or protein. The probability is encoded as an 8-bit value from 0-255 corresponding to a probability from 0.0 to 1.0 by interpolation.

This type of encoding would presumably never appear except in a Bioseq of type "consensus". In the C code implementation these amino acids are encoded at 30 bytes per amino acid in a simple linear order. That is, the first 30 bytes are the first amino acid, the second 30 the next amino acid, and so on.

IUPACna: The IUPAC-IUB Encoding for Nucleic Acids

Like the IUPACaa codes the IUPACna codes are single letters for nucleic acids and the value is the same as the ASCII value of the recommended IUPAC letter. The IUPAC recommendations for nucleic acid codes also include letters to represent all possible ambiguities at a single position in the sequence except a gap. To make the values non-redundant, U is considered the same as T. Whether a sequence actually contains U or T is easily determined from Seq-inst.mol. Since some software tools are designed to work directly on the ASCII representation of the IUPAC letters, this representation is provided. Note that the ASCII values correspond to the UPPER CASE letters. Using values corresponding to lower case letters in Seq-data is an error. For display to a user, any readable case or font is appropriate.

The C implementation encodes one value for a nucleic acid residue per byte.

IUPACna

Value

Symbol

Name

65

A

Adenine

66

B

G or T or C

67

C

Cytosine

68

D

G or A or T

71

G

Guanine

72

H

A or C or T

75

K

G or T

77

M

A or C

78

N

A or G or C or T

82

R

G or A

83

S

G or C

84

T

Thymine

86

V

G or C or A

87

W

A or T

89

Y

T or C

NCBI4na: A Four Bit Encoding of Nucleic Acids

It is possible to represent the same set of nucleic acid and ambiguities with a four bit code, where one bit corresponds to each possible base and where more than one bit is set to represent ambiguity.  The particular encoding used for NCBI4na is the same as that used on the GenBank Floppy Disk Format. A four bit encoding has several advantages over the direct mapping of the ASCII IUPAC codes. One can represent "no base" as 0000. One can match various ambiguous or unambiguous bases by a simple AND. For example, in NCBI4na 0001=A, 0010=C, 0100=G, 1000=T/U. Adenine (0001) then matches Purine (0101) by the AND method. Finally, it is possible to store the sequence in half the space by storing two bases per byte. This is done both in the ASN.1 encoding and in the NCBI C software implementation. Utility functions (see SeqPort()) allow the developer to ignore the complexities of storage while taking advantage of the greater packing. Since nucleic acid sequences can be very long, this is a real savings.

NCBI4na

Value

Symbol

Name

0

-

Gap

1

A

Adenine

2

C

Cytosine

3

M

A or C

4

G

Guanine

5

R

G or A

6

S

G or C

7

V

G or C or A

8

T

Thymine/Uracil

9

W

A or T

10

Y

T or C

11

H

A or C or T

12

K

G or T

13

D

G or A or T

14

B

G or T or C

15

N

A or G or C or T

NCBI2na: A Two Bit Encoding for Nucleic Acids

If no ambiguous bases are present in a nucleic acid sequence it can be completely encoded using only two bits per base. This allows encoding into ASN.1 or storage in the NCBI C implementation with a four fold savings in space. As with the four bit packing, the NCBI C utility SeqPort() allows the programmer to ignore the complexities introduced by the packing. The two bit encoding selected is the same as that proposed for the GenBank CDROM.

NCBI2na

Value

Symbol

Name

0

A

Adenine

1

C

Cytosine

2

G

Guanine

3

T

Thymine/Uracil

NCBI8na: An Eight Bit Sequential Encoding for Modified Nucleic Acids

The first 16 values of NCBI8na are identical with those of NCBI4na. The remaining possible 234 values will be used for common, biologically occurring modified bases such as those found in tRNAs. This full encoding is still being determined at the time of this writing. Only the first 250 values will be used, leaving values in the range of 250-255 to be used as control codes in software.

NCBIpna: A Frequency Profile Encoding for Nucleic Acids

Frequency profiles have been used to describe motifs and signals in nucleic acids. This can be encoded by using five bytes per sequence position. The first four bytes are used to express the probability that particular bases occur at that position, in the order A, C, G, T as in the NCBI2na encoding. The fifth position encodes the probability that a base occurs there at all. Each byte has a value from 0-255 corresponding to a probability from 0.0-1.0.

The sequence is encoded as a simple linear sequence of bytes where the first five bytes code for the first position, the next five for the second position, and so on. Typically the NCBIpna notation would only be found on a Bioseq of type consensus. However, one can imagine other uses for such an encoding, for example to represent knowledge about low resolution sequence data in an easily computable form.

Tables of Sequence Codes

Various sequence alphabets can be stored in tables of type Seq-code-table, defined in seqcode.asn. An enumerated type, Seq-code-type is used as a key to each table. Each code can be thought of as a square table essentially like those presented above in describing each alphabet. Each "residue" of the code has a numerical one-byte value used to represent that residue both in ASN.1 data and in internal C structures. The information necessary to display the value is given by the "symbol". A symbol can be in a one-letter series (e.g. A,G,C,T) or more than one letter (e.g. Met, Leu, etc.). The symbol gives a human readable representation that corresponds to each numerical residue value. A name, or explanatory string, is also associated with each.

So, the NCBI2na code above would be coded into a Seq-code-table very simply as:

        {                                -- NCBI2na

            code ncbi2na ,

            num 4 ,                               -- continuous 0-3

            one-letter TRUE ,            -- all one letter codes

            table {

                { symbol "A", name "Adenine" },

                { symbol "C", name "Cytosine" },

                { symbol "G", name "Guanine" },

                { symbol "T", name "Thymine/Uracil"}

            } ,                          -- end of table           

            comps {                      -- complements

                3,

                2,

                1,

                0

            }

        } ,

The table has 4 rows (with values 0-3) with one letter symbols. If we wished to represent a code with values which do not start at 0 (such as the IUPAC codes) then we would set the OPTIONAL "start-at" element to the value for the first row in the table.

In the case of nucleic acid codes, the Seq-code-table also has rows for indexes to complement the values represented in the table. In the example above, the complement of 0 ("A") is 3 ("T").

Mapping Between Different Sequence Alphabets

A Seq-map-table provides a mapping from the values of one alphabet to the values of another, very like the way complements are mapped above. A Seq-map-table has two Seq-code-types, one giving the alphabet to map from and the other the alphabet to map to. The Seq-map-table has the same number of rows and the same "start-at" value as the Seq-code-table for the alphabet it maps FROM. This makes the mapping a simple array lookup using the value of a residue of the FROM alphabet and subtracting "start-at". Remember that alphabets are not created equal and mapping from a bigger alphabet to a smaller may result in loss of information.

Data and Tools for Sequence Alphabets

NCBI provides a collection of Seq-code-tables and Seq-map-tables together in a Seq-code-set as part of the software toolbox. The file is called seqcode.prt (text form) or seqcode.val (binary ASN.1 used by the software). The function SeqCodeSetLoad() will check your NCBI configuration file looking for the path to "DATA", then read seqcode.val into memory using SeqCodeSetAsnRead(). A local static pointer to the loaded SeqCodes is kept in the SeqCode module, and thus need not be kept by the caller. Additional functions use the static pointer to provide access to the codes. SeqCodeTableFind() will return the appropriate SeqCodeTablePtr given a valid sequence code, and SeqMapTableFind() will return the appropriate SeqMapTablePtr given a code to map from and a code to map to. The SeqPort functions use these functions to provide a view of a sequence in any requested alphabet by mapping residues on demand.  See the chapter on Writing Sequence Software.

Pubdesc: Publication Describing a Bioseq

A Pubdesc is a data structure used to record how a particular publication described a Bioseq. It contains the citation itself as a Pub-equiv (see the Bibliographic References chapter) so that equivalent forms of the citation (e.g. a MEDLINE uid and a Cit-Art) can all be accommodated in a single data structure. Then a number of additional fields allow a more complete description of what was presented in the publication. These extra fields are generally only filled in for entries produced by the NCBI journal scanning component of GenBank, also known as the Backbone database. This information is not generally available in data from any other database yet.

Pubdesc.name is the name given the sequence in the publication, usually in the figure. Pubdesc.fig gives the figure the Bioseq appeared in so a scientist can locate it in the paper. Pubdesc.num preserves the numbering system used by the author (see Numbering below). Pubdesc.numexc, if TRUE, indicates that a "numbering exception" was found (i.e. the author's numbering did not agree with the number of residues in the sequence). This usually indicates an error in the preparation of the figure. If Pubdesc.poly-a is TRUE, then a poly-A tract was indicated for the Bioseq in the figure, but was not explicitly preserved in the sequence itself (e.g. ...AGAATTTCT (Poly-A) ). Pubdesc.maploc is the map location for this sequence as given by the author in this paper. Pubdesc.seq-raw allows the presentation of the sequence exactly as typed from the figure. This is never used now. Pubdesc.align-group, if present, indicates the Bioseq was presented in a group aligned with other Bioseqs. The align-group value is an arbitrary integer. Other Bioseqs from the same publication which are part of the same alignment will have the same align-group number.

Pubdesc.comment is simply a free text comment associated with this publication. SWISSPROT entries may also have this field filled.

Numbering: Applying a Numbering System to a Bioseq

Internally, locations on Bioseqs are ALWAYS integer offsets in the range 0 to (length - 1). However, it is often helpful to display some other numbering system. The Numbering data structure supports a variety of numbering styles and conventions. In the ASN.1 specification, it is simply a CHOICE of the four possible types. When a Numbering object is supplied as a Seq-descr, then it applies to the complete length of the Bioseq. A Numbering object can also be a feature, in which case it only applies to the interval defined by the feature's location.

Num-cont: A Continuous Integer Numbering System

The most widely used numbering system for sequences is some form of a continuous integer numbering. Num-cont.refnum is the number to assign to the first residue in the Bioseq. If Num-cont.has-zero is TRUE, the numbering system uses zero. When biologists start numbering with a negative number, it is quite common for them to skip zero, going directly from -1 to +1, so the DEFAULT for has-zero is FALSE. This only reflects common usage, not any recommendation in terms of convention. Any useful software tool should support both conventions, since they are both used in the literature. Finally, the most common numbering systems are ascending; however descending numbering systems are encountered from time to time, so Num-cont.ascending would then be set to FALSE.

Num-real: A Real Number Numbering Scheme

Genetic maps may use real numbers as "map units" since they treat the chromosome as a continuous coordinate system, instead of a discrete, integer coordinate system of base pairs. Thus a Bioseq of type "map" which may use an underlying integer coordinate system from 0 to 5 million may be best presented to the user in the familiar 0.0 to 100.0 map units. Num-real supports a simple linear equation specifying the relationship:

map units = ( Num-real.a * base_pair_position) + Num-real.b

in this example. Since such numbering systems generally have their own units (e.g. "map units", "centisomes", "centimorgans", etc), Num-real.units provides a string for labeling the display.

Num-enum: An Enumerated Numbering Scheme

Occasionally biologists do not use a continuous numbering system at all. Crystallographers and immunologists, for example, who do extensive studies on one or a few sequences, may name the individual residues in the sequence as they fit them into a theoretical framework. So one might see residues numbered ... "10" "11" "12" "12A" "12B" "12C" "13" "14" ... To accommodate this sort of scheme the "name" of each residue must be explicitly given by a string, since there is no anticipating any convention that may be used. The Num-enum.num gives the number of residue names (which should agree with the number of residues in the Bioseq, in the case of use as a Seq-descr), followed by the names as strings.

Num-ref: Numbering by Reference to Another Bioseq

Two types of references are allowed. The "sources" references are meant to apply the numbering system of constituent Bioseqs to a segmented Bioseq. This is useful for seeing the mapping from the parts to the whole.

The "aligns" reference requires that the Num-ref.aligns alignment be filled in with an alignment of the target Bioseq with one or more pieces of other Bioseqs. The numbering will come from the aligned pieces.

Numbering: C Structures and Utility Functions

A Numbering object is implemented in C simply as a ValNode, where ValNode.choice is given by a series of #defines in objpubd.h and ValNode.ptrvalue is a pointer to the appropriate data structure for the Numbering type.

In sequtil.h (see the Sequence Utilities chapter) a number of functions are defined which convert from internal to display numbering systems and vice versa. These functions make the use of fairly complex numbering systems fairly straightforward.

ASN.1 Specification: seq.asn

--$Revision: 1.2 $

--**********************************************************************

--

--  NCBI Sequence elements

--  by James Ostell, 1990

--

--**********************************************************************

 

NCBI-Sequence DEFINITIONS ::=

BEGIN

 

EXPORTS Bioseq, Seq-annot, Pubdesc, Seq-descr, Numbering, Heterogen;

 

IMPORTS Date, Int-fuzz, Dbtag, Object-id, User-object FROM NCBI-General

        Seq-align FROM NCBI-Seqalign

        Seq-feat FROM NCBI-Seqfeat

        Seq-graph FROM NCBI-Seqres

        Pub-equiv FROM NCBI-Pub

        Org-ref FROM NCBI-Organism

        Seq-id, Seq-loc FROM NCBI-Seqloc

        Link-set FROM NCBI-Access

       GB-block FROM GenBank-General

       PIR-block FROM PIR-General

        EMBL-block FROM EMBL-General

       SP-block FROM SP-General

       PRF-block FROM PRF-General

       PDB-block FROM PDB-General;

 

--*** Sequence ********************************

--*

 

-- Bioseq: one sequence record. Holds a set of identifiers, optional

-- descriptors, the required sequence instance and optional annotation.

Bioseq ::= SEQUENCE {

    id SET OF Seq-id ,            -- equivalent identifiers

    descr Seq-descr OPTIONAL , -- descriptors

    inst Seq-inst ,            -- the sequence data

    annot SET OF Seq-annot OPTIONAL }   -- optional set of Seq-annot

 

--*** Descriptors *****************************

--*

 

-- Seq-descr: a set of descriptors. Each element of the set is exactly one

-- of the alternatives listed below.

Seq-descr ::= SET OF CHOICE {

    mol-type GIBB-mol ,          -- type of molecule

    modif SET OF GIBB-mod ,      -- modifiers

    method GIBB-method ,         -- sequencing method

    name VisibleString ,         -- a name for this sequence

    title VisibleString ,        -- a title for this sequence

    org Org-ref ,                -- if all from one organism

    comment VisibleString ,      -- a more extensive comment

    num Numbering ,              -- a numbering system

    maploc Dbtag ,               -- map location of this sequence

    pir PIR-block ,              -- PIR specific info

    genbank GB-block ,           -- GenBank specific info

    pub Pubdesc ,                -- a reference to the publication

    region VisibleString ,       -- overall region (globin locus)

    user User-object ,           -- user defined object

   sp SP-block ,                -- SWISSPROT specific info

    neighbors Link-set ,         -- neighboring information

    embl EMBL-block ,            -- EMBL specific information

   create-date Date ,           -- date entry first created/released

   update-date Date ,           -- date of last update

   prf PRF-block ,                       -- PRF specific information

   pdb PDB-block ,              -- PDB specific information

   het Heterogen }              -- cofactor, etc associated but not bound

 

-- GIBB-mol: molecule type, used by the mol-type descriptor of Seq-descr.

GIBB-mol ::= ENUMERATED {       -- type of molecule represented

    unknown (0) ,

    genomic (1) ,

    pre-mRNA (2) ,

    mRNA (3) ,

    rRNA (4) ,

    tRNA (5) ,

    snRNA (6) ,

    scRNA (7) ,

    peptide (8) ,

   other-genetic (9) ,      -- other genetic material

   genomic-mRNA (10) ,      -- reported a mix of genomic and cDNA sequence

    other (255) }

   

-- GIBB-mod: modifier values. Seq-descr.modif carries a SET OF these, so

-- several modifiers can be given together.

GIBB-mod ::= ENUMERATED {        -- GenInfo Backbone modifiers

    dna (0) ,

    rna (1) ,

    extrachrom (2) ,

    plasmid (3) ,

    mitochondrial (4) ,

    chloroplast (5) ,

    kinetoplast (6) ,

    cyanelle (7) ,

    synthetic (8) ,

    recombinant (9) ,

    partial (10) ,

    complete (11) ,

    mutagen (12) ,    -- subject of mutagenesis ?

    natmut (13) ,     -- natural mutant ?

    transposon (14) ,

    insertion-seq (15) ,

   no-left (16) ,    -- missing left end (5' for na, NH2 for aa)

   no-right (17) ,   -- missing right end (3' or COOH)

   macronuclear (18) ,

   proviral (19) ,

   est (20) ,        -- expressed sequence tag

    other (255) }

 

-- GIBB-method: how the sequence was determined, used by Seq-descr.method.

-- Note that the values start at 1; no value (0) is defined here.

GIBB-method ::= ENUMERATED {        -- sequencing methods

    concept-trans (1) ,    -- conceptual translation

    seq-pept (2) ,         -- peptide was sequenced

    both (3) ,             -- concept transl. w/ partial pept. seq.

   seq-pept-overlap (4) , -- sequenced peptide, ordered by overlap

   seq-pept-homol (5) ,   -- sequenced peptide, ordered by homology

   concept-trans-a (6) ,  -- conceptual transl. supplied by author

    other (255) }

   

-- Numbering: selects one of four ways to number residues for display.

-- Used by Seq-descr.num and Pubdesc.num.

Numbering ::= CHOICE {           -- any display numbering system

    cont Num-cont ,              -- continuous numbering

    enum Num-enum ,              -- enumerated names for residues

    ref Num-ref ,                -- by reference to another sequence

    real Num-real }              -- supports mapping to a float system

   

-- Num-cont: continuous integer numbering. With has-zero FALSE (the default)

-- the numbering goes directly from -1 to +1, skipping zero.

Num-cont ::= SEQUENCE {          -- continuous display numbering system

    refnum INTEGER DEFAULT 1,         -- number assigned to first residue

    has-zero BOOLEAN DEFAULT FALSE ,  -- 0 used?

    ascending BOOLEAN DEFAULT TRUE }  -- ascending numbers? FALSE means descending

 

-- Num-enum: every residue is given an explicit name string, for example

-- "12" "12A" "12B". When used as a Seq-descr, num should agree with the

-- number of residues in the Bioseq.

Num-enum ::= SEQUENCE {          -- any tags to residues

    num INTEGER ,                        -- number of tags to follow

    names SEQUENCE OF VisibleString }    -- the tags

 

-- Num-ref: numbering taken from other Bioseqs, either from the constituent

-- Bioseqs of a segmented Bioseq (sources) or from aligned pieces (aligns).

Num-ref ::= SEQUENCE {           -- by reference to other sequences

    type ENUMERATED {            -- type of reference

        not-set (0) ,

        sources (1) ,            -- by segmented or const seq sources

        aligns (2) } ,           -- by alignments given below

    aligns Seq-align OPTIONAL }  -- must be filled in when type is aligns

 

-- Num-real: linear mapping from the integer coordinate system used by the

-- Bioseq to a real-valued display system:

--     position = (a * int_position) + b

Num-real ::= SEQUENCE {          -- mapping to floating point system

    a REAL ,                     -- multiplier applied to the integer position

    b REAL ,                     -- offset: position = (a * int_position) + b

    units VisibleString OPTIONAL }   -- label for the display, e.g. map units

 

-- Pubdesc: a citation plus optional details of how the sequence appeared

-- in that publication. Only pub is required.

Pubdesc ::= SEQUENCE {              -- how sequence presented in pub

    pub Pub-equiv ,                 -- the citation(s)

    name VisibleString OPTIONAL ,   -- name used in paper

    fig VisibleString OPTIONAL ,    -- figure in paper

    num Numbering OPTIONAL ,        -- numbering from paper

    numexc BOOLEAN OPTIONAL ,       -- numbering problem with paper

    poly-a BOOLEAN OPTIONAL ,       -- poly A tail indicated in figure?

    maploc VisibleString OPTIONAL , -- map location reported in paper

    seq-raw StringStore OPTIONAL ,  -- original sequence from paper

    align-group INTEGER OPTIONAL ,  -- this seq aligned with others in paper

   comment VisibleString OPTIONAL }-- any comment on this pub in context

 

Heterogen ::= VisibleString       -- cofactor, prosthetic group, inhibitor, etc

 

--*** Instances of sequences *******************************

--*

 

-- Seq-inst: the instance of a sequence. repr and mol are required, topology

-- defaults to linear, and every other field is optional.

Seq-inst ::= SEQUENCE {            -- the sequence data itself

    repr ENUMERATED {              -- representation class

        not-set (0) ,              -- empty

        virtual (1) ,              -- no seq data

        raw (2) ,                  -- continuous sequence

        seg (3) ,                  -- segmented sequence

        const (4) ,                -- constructed sequence (uses Seq-hist.assembly)

        ref (5) ,                  -- reference to another sequence

        consen (6) ,               -- consensus sequence or pattern

        map (7) ,                  -- ordered map (genetic, restriction)

        other (255) } ,

    mol ENUMERATED {               -- molecule class in living organism

        not-set (0) ,              --   > cdna = rna

        dna (1) ,

        rna (2) ,                  -- also used for cDNA (see note above)

        aa (3) ,                   -- amino acid

        na (4) ,                   -- just a nucleic acid

        other (255) } ,

    length INTEGER OPTIONAL ,      -- length of sequence in residues

    fuzz Int-fuzz OPTIONAL ,       -- length uncertainty

    topology ENUMERATED {          -- topology of molecule

        not-set (0) ,

        linear (1) ,

        circular (2) ,

        tandem (3) ,               -- some part of tandem repeat

        other (255) } DEFAULT linear ,

    strand ENUMERATED {            -- strandedness in living organism

        not-set (0) ,

        ss (1) ,                   -- single strand

        ds (2) ,                   -- double strand

        mixed (3) ,

        other (255) } OPTIONAL ,   -- default ds for DNA, ss for RNA, pept

    seq-data Seq-data OPTIONAL ,   -- the sequence

    ext Seq-ext OPTIONAL ,         -- extensions for special types

   hist Seq-hist OPTIONAL }       -- sequence history

 

--*** Sequence Extensions **********************************

--*  for representing more complex types

--*  const type uses Seq-hist.assembly

 

-- Seq-ext: extension data carried in Seq-inst.ext. Exactly one alternative

-- is present.

Seq-ext ::= CHOICE {

    seg Seg-ext ,        -- segmented sequences

    ref Ref-ext ,        -- hot link to another sequence (a view)

    map Map-ext }        -- ordered map of markers

 

Seg-ext ::= SEQUENCE OF Seq-loc    -- ordered locations for a segmented sequence (Seq-ext.seg)

 

Ref-ext ::= Seq-loc    -- single location on another sequence (Seq-ext.ref)

 

Map-ext ::= SEQUENCE OF Seq-feat    -- ordered features serving as map markers (Seq-ext.map)

 

--*** Sequence History Record ***********************************

--** assembly = records how seq was assembled from others

--** replaces = records sequences made obsolete by this one

--** replaced-by = this seq is made obsolete by another(s)

 

-- Seq-hist: history of a sequence. All four fields are optional.

Seq-hist ::= SEQUENCE {

   assembly SET OF Seq-align OPTIONAL ,-- how was this assembled?

   replaces Seq-hist-rec OPTIONAL ,    -- seq makes these seqs obsolete

   replaced-by Seq-hist-rec OPTIONAL , -- these seqs make this one obsolete

   deleted CHOICE {                    -- given either as a flag or as a date

       bool BOOLEAN ,                  -- NOTE(review): presumably TRUE when deleted; confirm

       date Date } OPTIONAL }          -- NOTE(review): presumably the deletion date; confirm

 

-- Seq-hist-rec: one history record, used by Seq-hist.replaces and

-- Seq-hist.replaced-by.

Seq-hist-rec ::= SEQUENCE {

   date Date OPTIONAL ,      -- NOTE(review): presumably when the replacement occurred; confirm

   ids SET OF Seq-id }       -- identifiers of the sequences concerned

  

--*** Various internal sequence representations ************

--*      all are controlled, fixed length forms

 

-- Seq-data: the sequence itself, in exactly one of the encodings below.

-- Alternatives ending in na are nucleic acid codes; those ending in aa

-- are amino acid codes.

Seq-data ::= CHOICE {              -- sequence representations

    iupacna IUPACna ,              -- IUPAC 1 letter nuc acid code

    iupacaa IUPACaa ,              -- IUPAC 1 letter amino acid code

    ncbi2na NCBI2na ,              -- 2 bit nucleic acid code

    ncbi4na NCBI4na ,              -- 4 bit nucleic acid code

    ncbi8na NCBI8na ,              -- 8 bit extended nucleic acid code

    ncbipna NCBIpna ,              -- nucleic acid probabilities

    ncbi8aa NCBI8aa ,              -- 8 bit extended amino acid codes

    ncbieaa NCBIeaa ,              -- extended ASCII 1 letter aa codes

    ncbipaa NCBIpaa ,              -- amino acid probabilities

    ncbistdaa NCBIstdaa }          -- consecutive codes for std aas

 

 

-- Concrete types behind each Seq-data alternative. The letter-code

-- alphabets are StringStore; the packed, numeric and probability

-- alphabets are OCTET STRING.

IUPACna ::= StringStore       -- IUPAC 1 letter codes, no spaces

IUPACaa ::= StringStore       -- IUPAC 1 letter codes, no spaces

NCBI2na ::= OCTET STRING      -- 00=A, 01=C, 10=G, 11=T

NCBI4na ::= OCTET STRING      -- 1 bit each for agct

                              -- 0001=A, 0010=C, 0100=G, 1000=T/U

                              -- 0101=Purine, 1010=Pyrimidine, etc

NCBI8na ::= OCTET STRING      -- for modified nucleic acids

NCBIpna ::= OCTET STRING      -- 5 octets/base, prob for a,c,g,t,n

                              -- probabilities are coded 0-255 = 0.0-1.0

NCBI8aa ::= OCTET STRING      -- for modified amino acids

NCBIeaa ::= StringStore       -- ASCII extended 1 letter aa codes

                              -- IUPAC codes + U=selenocysteine

NCBIpaa ::= OCTET STRING      -- 25 octets/aa, prob for IUPAC aas in order:

                              -- A-Y,B,Z,X,(ter),anything

                              -- probabilities are coded 0-255 = 0.0-1.0

NCBIstdaa ::= OCTET STRING    -- codes 0-25, 1 per byte

 

--*** Sequence Annotation *************************************

--*

 

-- Seq-annot: a block of annotation. The four leading fields are optional;

-- data is required and holds a set of features, alignments or graphs.

Seq-annot ::= SEQUENCE {

    id Object-id OPTIONAL ,          -- NOTE(review): presumably identifies this annotation; confirm

    db Dbtag OPTIONAL ,              -- NOTE(review): presumably the source database; confirm

    name VisibleString OPTIONAL ,    -- NOTE(review): presumably a short name; confirm

    desc VisibleString OPTIONAL ,    -- NOTE(review): presumably a longer description; confirm

    data CHOICE {                    -- exactly one kind of annotation per Seq-annot

        ftable SET OF Seq-feat ,     -- set of Seq-feat

        align SET OF Seq-align ,     -- set of Seq-align

        graph SET OF Seq-graph } }   -- set of Seq-graph

 

END

ASN.1 Specification: seqblock.asn

--$Revision: 1.2 $

--*********************************************************************

--

--  EMBL specific data

--  This block of specifications was developed by Reiner Fuchs of EMBL

--

--*********************************************************************

 

EMBL-General DEFINITIONS ::=

BEGIN

 

EXPORTS EMBL-dbname, EMBL-xref, EMBL-block;

 

IMPORTS Date, Object-id FROM NCBI-General;

 

-- EMBL-dbname: names a database either by one of the enumerated codes

-- or by a free-text name.

EMBL-dbname ::= CHOICE {

   code ENUMERATED {

       embl(0),

       genbank(1),

       ddbj(2),

       geninfo(3),

       medline(4),

      swissprot(5),

       pir(6),

       pdb(7),

       epd(8),

       ecd(9),

       tfd(10),

       flybase(11),

       prosite(12),

       enzyme(13),

       mim(14),

       ecoseq(15),

       hiv(16) },

   name   VisibleString }    -- NOTE(review): presumably for databases without a code; confirm

 

-- EMBL-xref: a cross-reference made of a database name and one or more

-- identifiers. Used by EMBL-block.xref.

EMBL-xref ::= SEQUENCE {

   dbname EMBL-dbname,          -- which database

   id SEQUENCE OF Object-id }   -- NOTE(review): presumably ids within that database; confirm

 

-- EMBL-block: EMBL specific data, carried by the embl descriptor of

-- Seq-descr. creation-date and update-date are required; class defaults

-- to standard and the remaining fields are optional.

EMBL-block ::= SEQUENCE {

   class ENUMERATED {

       not-set(0),

       standard(1),

       unannotated(2),

       other(255) } DEFAULT standard,

   div ENUMERATED {      -- NOTE(review): codes presumably name EMBL divisions; confirm

       fun(0),

       inv(1),

       mam(2),

       org(3),

       phg(4),

       pln(5),

       pri(6),

       pro(7),

       rod(8),

       syn(9),

       una(10),

       vrl(11),

       vrt(12) } OPTIONAL,

   creation-date Date,

   update-date Date,

   extra-acc SEQUENCE OF VisibleString OPTIONAL,   -- NOTE(review): presumably additional accession numbers; confirm

   keywords SEQUENCE OF VisibleString OPTIONAL,

   xref SEQUENCE OF EMBL-xref OPTIONAL }           -- cross-references, see EMBL-xref

 

END

 

--*********************************************************************

--

--  SWISSPROT specific data

--  This block of specifications was developed by Mark Cavanaugh of

--     NCBI working with Amos Bairoch of SWISSPROT

--

--*********************************************************************

 

SP-General DEFINITIONS ::=

BEGIN

 

EXPORTS SP-block;

 

-- Seq-id is imported from the module spelled NCBI-Seqloc, the same

-- spelling the NCBI-Sequence module uses in its IMPORTS. ASN.1 module

-- references are case-sensitive, so NCBI-SeqLoc would not resolve.

IMPORTS Date, Dbtag FROM NCBI-General

       Seq-id FROM NCBI-Seqloc;