NCBI C++ ToolKit
user_agent.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

00001 #ifndef CGI___USER_AGENT__HPP
00002 #define CGI___USER_AGENT__HPP
00003 
00004 /*  $Id: user_agent.hpp 67263 2015-05-04 14:38:31Z ivanov $
00005  * ===========================================================================
00006  *
00007  *                            PUBLIC DOMAIN NOTICE
00008  *               National Center for Biotechnology Information
00009  *
00010  *  This software/database is a "United States Government Work" under the
00011  *  terms of the United States Copyright Act.  It was written as part of
00012  *  the author's official duties as a United States Government employee and
00013  *  thus cannot be copyrighted.  This software/database is freely available
00014  *  to the public for use. The National Library of Medicine and the U.S.
00015  *  Government have not placed any restriction on its use or reproduction.
00016  *
00017  *  Although all reasonable efforts have been taken to ensure the accuracy
00018  *  and reliability of the software and data, the NLM and the U.S.
00019  *  Government do not and cannot warrant the performance or results that
00020  *  may be obtained by using this software or data. The NLM and the U.S.
00021  *  Government disclaim all warranties, express or implied, including
00022  *  warranties of performance, merchantability or fitness for any particular
00023  *  purpose.
00024  *
00025  *  Please cite the author in any work or product based on this material.
00026  *
00027  * ===========================================================================
00028  *
00029  * Authors: Vladimir Ivanov
00030  *
00031  */
00032 
00033 /// @file user_agent.hpp
00034 /// API to parse user agent strings.
00035 ///
00036 
00037 #include <corelib/version.hpp>
00038 
00039 /** @addtogroup CGI
00040  *
00041  * @{
00042  */
00043 
00044 BEGIN_NCBI_SCOPE
00045 
00046 
00047 /// Version info (major, minor, patch level) of a user agent component: browser, engine or Mozilla.
00048 typedef CVersionInfo TUserAgentVersion;
00049 
00050 
00051 /////////////////////////////////////////////////////////////////////////////
00052 ///
00053 /// CCgiUserAgent --
00054 ///
00055 /// Class that parses user agent strings.
00056 /// Primarily supports the Mozilla 'compatible' format.
00057 ///
00058 class NCBI_XCGI_EXPORT CCgiUserAgent
00059 {
00060 public:
00061     /// Comparison and parsing flags.
00062     enum EFlags {
00063         /// Case-insensitive comparison; it is case-sensitive by default
00064         fNoCase            = (1 << 1),
00065         /// Use the external pattern list from registry/environment to check
00066         /// for bots; off by default
00067         fUseBotPatterns    = (1 << 2),
00068         /// Use the external pattern lists from registry/environment to check
00069         /// for phone/tablet/mobile devices when parsing; off by default
00070         fUseDevicePatterns = (1 << 3)
00071     };
00072     typedef unsigned int TFlags;  ///< Binary OR of "EFlags"
00073 
00074     /// Default constructor.
00075     /// Parse environment variable HTTP_USER_AGENT.
00076     CCgiUserAgent(TFlags flags = 0);
00077 
00078     /// Constructor.
00079     /// Parse the user agent string passed into the constructor.
00080     /// @note
00081     ///   Some registry/environment parameters can affect user agent parsing.
00082     ///   All such features are off by default for better performance, see EFlags.
00083     ///   But they will be used regardless of the specified flags in the
00084     ///   IsBot/IsMobileDevice/IsTabletDevice methods.
00085     CCgiUserAgent(const string& user_agent, TFlags flags = 0);
00086 
00087     /// Parse a new user agent string.
00088     void Reset(const string& user_agent);
00089 
00090     /// Browser types.
00091     enum EBrowser {
00092         eUnknown = 0,           ///< Unknown user agent
00093 
00094         eIE,                    ///< Microsoft Internet Explorer (www.microsoft.com/windows/ie)
00095         eEdge,                  ///< Microsoft Edge (www.microsoft.com)
00096         eiCab,                  ///< iCab       (www.icab.de)
00097         eLynx,                  ///< Lynx       (lynx.browser.org)
00098         eNetscape,              ///< Netscape (Navigator), versions >=6 are Gecko-based (www.netscape.com)
00099         eOpera,                 ///< Opera      (www.opera.com)
00100         eOregano,               ///< Oregano    (www.castle.org.uk/oregano/)
00101         eW3m,                   ///< w3m        (www.w3m.org)
00102         eNagios,                ///< check_http/nagios-plugins (nagiosplugins.org)
00103 
00104         // Gecko-based browsers
00105         eBeonex,                ///< Beonex Communicator (www.beonex.com)
00106         eCamino,                ///< Camino     (www.caminobrowser.org)
00107         eChimera,               ///< Chimera    (chimera.mozdev.org)
00108         eFirefox,               ///< Firefox    (www.mozilla.org/products/firefox)
00109         eFlock,                 ///< Flock      (www.flock.com)
00110         eIceCat,                ///< GNU IceCat (http://www.gnu.org/software/gnuzilla)
00111         eIceweasel,             ///< Debian Iceweasel   (www.geticeweasel.org)
00112         eGaleon,                ///< Galeon     (galeon.sourceforge.net)
00113         eGranParadiso,          ///< GranParadiso (www.mozilla.org)
00114         eKazehakase,            ///< Kazehakase (kazehakase.sourceforge.jp)
00115         eKMeleon,               ///< K-Meleon   (kmeleon.sf.net)
00116         eKNinja,                ///< K-Ninja Samurai (k-ninja-samurai.en.softonic.com)
00117         eMadfox,                ///< Madfox     (www.splyb.com/madfox)
00118         eMultiZilla,            ///< MultiZilla (multizilla.mozdev.org)
00119         eSeaMonkey,             ///< SeaMonkey  (www.mozilla.org/projects/seamonkey)
00120 
00121         // IE-based
00122         eAcooBrowser,           ///< Acoo Browser   (www.acoobrowser.com)
00123         eAOL,                   ///< America Online Browser (www.aol.com)
00124         eAvantBrowser,          ///< Avant Browser  (www.avantbrowser.com)
00125         eCrazyBrowser,          ///< Crazy Browser  (www.crazybrowser.com)
00126         eEnigmaBrowser,         ///< Enigma Browser (www.suttondesigns.com)
00127         eIRider,                ///< iRider         (www.irider.com)
00128         eMaxthon,               ///< Maxthon/MyIE2  (www.maxthon.com)
00129         eNetCaptor,             ///< NetCaptor      (www.netcaptor.com)
00130 
00131         // Blink/WebKit/KHTML based
00132         eChrome,                ///< Google Chrome  (www.google.com/chrome)
00133         eFluid,                 ///< Fluid       (fluidapp.com)
00134         eKonqueror,             ///< Konqueror  (www.konqueror.org) (KHTML based since v3.2 ?)
00135         eMidori,                ///< Midori
00136         eNetNewsWire,           ///< NetNewsWire (www.apple.com)
00137         eOmniWeb,               ///< OmniWeb     (www.omnigroup.com/applications/omniweb)
00138         eQtWeb,                 ///< QtWeb       (www.qtweb.net)
00139         eSafari,                ///< Safari      (www.apple.com/safari)
00140         eShiira,                ///< Shiira      (hmdt-web.net/shiira/en)
00141         eStainless,             ///< Stainless   (www.stainlessapp.com)
00142 
00143         /// Search robots/bots/validators
00144         eCrawler,               ///< Class: crawlers / search robots
00145         eOfflineBrowser,        ///< Class: offline browsers
00146         eScript,                ///< Class: script tools (perl/php/...)
00147         eLinkChecker,           ///< Class: link checkers
00148         eWebValidator,          ///< Class: validators
00149 
00150         /// Mobile devices (browsers and services for: telephones, smartphones, tablets, etc.)
00151         /// Some mobile devices use standard browsers, like Opera or Safari -- see browser platform,
00152         /// if you need to check for a mobile device.
00153 
00154         // See: http://www.zytrax.com/tech/web/mobile_ids.html
00155 
00156         eAirEdge,               ///< AIR-EDGE     (www.willcom-inc.com/en/)
00157         eAvantGo,               ///< AvantGo      (www.sybase.com/avantgo)
00158         eBlackberry,            ///< Blackberry   (www.blackberry.com)
00159         eDoCoMo,                ///< DoCoMo       (www.nttdocomo.com)
00160         eEudoraWeb,             ///< EudoraWeb    (www.eudora.com)
00161         eMinimo,                ///< Minimo       (www.mozilla.org/projects/minimo)
00162         eNetFront,              ///< NetFront     (www.access-company.com)
00163         eOperaMini,             ///< Opera Mini   (www.opera.com/mini)
00164         eOperaMobile,           ///< Opera Mobile (www.opera.com/mobile)
00165         eOpenWave,              ///< OpenWave/UP.Browser (www.openwave.com)
00166         ePIE,                   ///< Pocket IE    (www.reensoft.com/PIEPlus)
00167         ePlucker,               ///< Plucker      (www.plkr.org)
00168         ePocketLink,            ///< PocketLink   (www.mobilefan.net)
00169         ePolaris,               ///< Polaris Browser (www.infraware.co.kr)
00170         eReqwireless,           ///< Reqwireless Webviewer
00171         eSafariMobile,          ///< Mobile Safari (www.apple.com/safari)
00172         eSEMCBrowser,           ///< Sony Ericsson SEMC-Browser (www.sonyericsson.com)
00173         eTelecaObigo,           ///< Teleca/Obigo  (www.teleca.com / www.obigo.com)
00174         euZardWeb,              ///< uZard Web     (www.uzard.com)
00175         eVodafone,              ///< Ex J-Phone, now Vodafone Live! (www.vodafone.com)
00176         eXiino,                 ///< Xiino        (www.ilinx.co.jp/en/)
00177 
00178         /// Any other Gecko-based browser not in the list above,
00179         /// Mozilla version >= 5.0
00180         eMozilla,                ///< Mozilla/other Gecko-based (www.mozilla.com)
00181 
00182         /// Any other browser not in the list above.
00183         /// User agent string starts with "Mozilla/x.x (compatible;*".
00184         /// Not Gecko-based.
00185         eMozillaCompatible      ///< Mozilla-compatible
00186     };
00187 
00188     /// Browser engine types.
00189     enum EBrowserEngine {
00190         eEngine_Unknown = eUnknown,     ///< Unknown engine
00191         eEngine_IE      = eIE,          ///< Microsoft Internet Explorer (Trident, etc.)
00192         eEngine_Edge    = eEdge,        ///< Microsoft Edge
00193         eEngine_Gecko   = eMozilla,     ///< Gecko-based
00194         eEngine_KHTML   = eKonqueror,   ///< KHTML-based
00195         eEngine_WebKit  = eSafari,      ///< Apple WebKit (KHTML fork)
00196         eEngine_Blink   = eChrome,      ///< Google Blink (WebKit/537.36 fork)
00197         eEngine_Bot     = eCrawler      ///< Search robot/bot/checker/...
00198     };
00199 
00200     /// Platform types
00201     enum EBrowserPlatform {
00202         ePlatform_Unknown = eUnknown,   ///< Unknown OS
00203         ePlatform_Windows,              ///< Microsoft Windows
00204         ePlatform_Mac,                  ///< MacOS
00205         ePlatform_Unix,                 ///< Unix
00206 
00207         // Mobile devices (telephones, smartphones, tablets, etc.)
00208         ePlatform_Android,              ///< Android
00209         ePlatform_Palm,                 ///< PalmOS
00210         ePlatform_Symbian,              ///< SymbianOS
00211         ePlatform_WindowsCE,            ///< Microsoft Windows CE (+ Windows Mobile)
00212         ePlatform_MobileDevice          ///< Other mobile devices or services
00213     };
00214 
00215     /// Get user agent string.
00216     string GetUserAgentStr(void) const
00217         { return m_UserAgent; }
00218 
00219     /// Get browser type.
00220     EBrowser GetBrowser(void) const
00221         { return m_Browser; }
00222 
00223     /// Get browser name.
00224     ///
00225     /// @return
00226     ///   Browser name or empty string for unknown browser
00227     /// @sa GetBrowser
00228     const string& GetBrowserName(void) const
00229         { return m_BrowserName; }
00230 
00231     /// Get browser engine type and name.
00232     /// @sa EBrowserEngine
00233     EBrowserEngine GetEngine(void) const 
00234         { return m_Engine; }
00235     string GetEngineName(void) const;
00236 
00237     /// Get platform (OS) type and name.
00238     /// @sa EBrowserPlatform
00239     EBrowserPlatform GetPlatform(void) const 
00240         { return m_Platform; }
00241     string GetPlatformName(void) const;
00242 
00243     /// Get browser version information.
00244     ///
00245     /// If a version field (major, minor, patch level) equals -1, then
00246     /// it is not defined.
00247     const TUserAgentVersion& GetBrowserVersion(void) const
00248         { return m_BrowserVersion; }
00249     const TUserAgentVersion& GetEngineVersion(void) const
00250         { return m_EngineVersion; }
00251     const TUserAgentVersion& GetMozillaVersion(void) const
00252         { return m_MozillaVersion; }
00253 
00254 
00255     /// Bot check flags (what to consider a bot).
00256     /// @sa EBrowser, EBrowserEngine
00257     enum EBotFlags {
00258         fBotCrawler         = (1<<1), 
00259         fBotOfflineBrowser  = (1<<2), 
00260         fBotScript          = (1<<3), 
00261         fBotLinkChecker     = (1<<4), 
00262         fBotWebValidator    = (1<<5), 
00263         fBotAll             = 0xFF
00264     };
00265     typedef unsigned int TBotFlags;    ///< Binary OR of "EBotFlags"
00266 
00267     /// Check that this is a known browser.
00268     ///
00269     /// @note
00270     ///   This method can return FALSE for old or unknown browsers,
00271     ///   or browsers for mobile devices. Use it with caution.
00272     /// @sa GetBrowser, GetEngine
00273     bool IsBrowser(void) const;
00274 
00275     /// Check that this is a known search robot/bot.
00276     ///
00277     /// By default it uses the GetEngine() and GetBrowser() values to check for
00278     /// known bots, and only here can the 'flags' parameter be used.
00279     /// @param include_patterns
00280     ///   List of additional patterns that can treat current user agent
00281     ///   as bot. If standard check fails, this string and/or
00282     ///   registry/environment parameters (section 'CGI', name 'Bots')
00283     ///   will be used. String value should have patterns for search in
00284     ///   the user agent string, and should look like:
00285     ///       "Googlebot Scooter WebCrawler Slurp"
00286     ///   You can use any delimiters from the following list " ;|~\t".
00287     ///   If you want to use space or any other symbol from the delimiters list
00288     ///   as part of the pattern, then use multi-line pattern. In this case
00289     ///   each line contains a single pattern with any symbol inside it.
00290     ///   All patterns are case sensitive.
00291     ///   For details how to define registry/environment parameter see
00292     ///   CParam description.
00293     /// @param exclude_patterns
00294     ///   This parameter and string from (section 'CGI', name 'NotBots') can be
00295     ///   used to remove any user agent signature from list of bots, if you
00296     ///   don't agree with parser's decision. IsBot() will return FALSE if
00297     ///   the user agent string contains one of these patterns.
00298     /// @note
00299     ///   Registry file:
00300     ///       [CGI]
00301     ///       Bots = ...
00302     ///       NotBots = ...
00303     ///   Environment variables:
00304     ///       NCBI_CONFIG__CGI__Bots  = ...
00305     ///       NCBI_CONFIG__CGI__NotBots  = ...
00306     /// @sa
00307     ///   GetBrowser, GetEngine, CParam
00308     bool IsBot(TBotFlags flags = fBotAll,
00309                const string& include_patterns = kEmptyStr,
00310                const string& exclude_patterns = kEmptyStr) const;
00311 
00312     /// Flags to check device type.
00313     /// Zero value means unknown device type, usually considered as desktop.
00314     /// @sa GetDeviceType, IsMobileDevice, IsTabletDevice
00315     enum EDeviceFlags {
00316         fDevice_Phone        = (1<<1),   ///< Phone / not known tablet / mobile browser on desktop
00317         fDevice_Tablet       = (1<<2),   ///< Known tablet
00318         fDevice_Mobile       = fDevice_Phone | fDevice_Tablet
00319     };
00320     typedef unsigned int TDeviceFlags;    ///< Binary OR of "EDeviceFlags"
00321 
00322     /// Get device type.
00323     ///
00324     /// Use this method with caution, because it is impossible to detect
00325     /// resolution or form-factor of the device based on user agent string only.
00326     /// We can only make assumptions here.
00327     /// @return
00328     ///   Bit mask with categories of devices that can have such user agent string.
00329     ///   Zero value means unknown device type, usually considered as desktop.
00330     /// @note
00331     ///   Some registry/environment parameters can affect user agent parsing.
00332     ///   See IsPhoneDevice(), IsMobileDevice() and IsTabletDevice() for details.
00333     /// @sa IsMobileDevice, IsTabletDevice
00334     TDeviceFlags GetDeviceType(void) const
00335         { return m_DeviceFlags; }
00336 
00337 
00338     /// Check that this is a known phone-size device.
00339     ///
00340     /// Use this method with caution, because it is impossible to detect
00341     /// resolution and form-factor of the device based on user agent string only,
00342     /// we can only make assumptions here. Also, we cannot tell whether a
00343     /// device can make calls or not.
00344     /// @param include_patterns
00345     ///   List of additional patterns that can treat current user agent
00346     ///   as phone-size device if standard check fails, this string and/or
00347     ///   registry/environment parameter (section 'CGI', name 'PhoneDevices')
00348     ///   will be used. String value should have patterns for search in
00349     ///   the user agent string, and should look like: "Name1 Name2 Name3".
00350     ///   You can use any delimiters from the following list " ;|~\t".
00351     ///   If you want to use space or any other symbol from the delimiters list
00352     ///   as part of the pattern, then use multi-line pattern. In this case
00353     ///   each line contains a single pattern with any symbol inside it.
00354     ///   All patterns are case sensitive by default unless fNoCase flag is specified.
00355     /// @param exclude_patterns
00356     ///   This parameter and string from (section 'CGI', name 'NotPhoneDevices')
00357     ///   can be used to remove any user agent signature from list of phone-size
00358     ///   devices, if you don't agree with parser's decision. IsPhoneDevice()
00359     ///   will return FALSE if the user agent string contains one of these patterns.
00360     /// @return
00361     ///    Return TRUE for all user agent strings that have known signatures of
00362     ///    phone-size devices. We can detect only limited number of such devices,
00363     ///    so be aware.
00364     /// @note
00365     ///   Registry file:
00366     ///       [CGI]
00367     ///       PhoneDevices = ...
00368     ///       NotPhoneDevices = ...
00369     ///   Environment variables:
00370     ///       NCBI_CONFIG__CGI__PhoneDevices = ...
00371     ///       NCBI_CONFIG__CGI__NotPhoneDevices = ...
00372     /// @sa
00373     ///   GetDeviceType, GetPlatform, EBrowserPlatform, CParam, IsTabletDevice, IsMobileDevice
00374     bool IsPhoneDevice(const string& include_patterns = kEmptyStr,
00375                        const string& exclude_patterns = kEmptyStr) const;
00376 
00377     /// Check that this is a known tablet device.
00378     ///
00379     /// Use this method with caution, because it is impossible to detect
00380     /// resolution or form-factor of the device based on user agent string only,
00381     /// we can only make assumptions here.
00382     /// @param include_patterns
00383     ///   List of additional patterns that can treat current user agent
00384     ///   as tablet device if standard check fails, this string and/or
00385     ///   registry/environment parameter (section 'CGI', name 'TabletDevices')
00386     ///   will be used. String value should have patterns for search in
00387     ///   the user agent string, and should look like: "Name1 Name2 Name3".
00388     ///   You can use any delimiters from the following list " ;|~\t".
00389     ///   If you want to use space or any other symbol from the delimiters list
00390     ///   as part of the pattern, then use multi-line pattern. In this case
00391     ///   each line contains a single pattern with any symbol inside it.
00392     ///   All patterns are case sensitive by default unless fNoCase flag is specified.
00393     /// @param exclude_patterns
00394     ///   This parameter and string from (section 'CGI', name 'NotTabletDevices')
00395     ///   can be used to remove any user agent signature from list of tablet
00396     ///   devices, if you don't agree with parser's decision. IsTabletDevice()
00397     ///   will return FALSE if the user agent string contains one of these patterns.
00398     /// @note
00399     ///   Registry file:
00400     ///       [CGI]
00401     ///       TabletDevices = ...
00402     ///       NotTabletDevices = ...
00403     ///   Environment variables:
00404     ///       NCBI_CONFIG__CGI__TabletDevices = ...
00405     ///       NCBI_CONFIG__CGI__NotTabletDevices = ...
00406     /// @return
00407     ///    Return TRUE for devices that can be detected as tablets.
00408     ///    Usually, IsMobileDevice() also returns TRUE for the same
00409     ///    user agent string. Not all devices can be detected as tablets,
00410     ///    only a few combinations of new versions of browsers and OS provide
00411     ///    such information in the UA-string, and limited number of device
00412     ///    names can be used for such detection.
00413     /// @sa
00414     ///   GetDeviceType, CParam, IsMobileDevice, IsPhoneDevice
00415     bool IsTabletDevice(const string& include_patterns = kEmptyStr,
00416                         const string& exclude_patterns = kEmptyStr) const;
00417 
00418     /// Check that this is a known mobile device.
00419     ///
00420     /// Use this method with caution, because it is impossible to detect
00421     /// resolution or form-factor of the device based on user agent string only,
00422     /// we can only make assumptions here.
00423     /// @param include_patterns
00424     ///   List of additional patterns that can treat current user agent
00425     ///   as mobile device if standard check fails, this string and/or
00426     ///   registry/environment parameter (section 'CGI', name 'MobileDevices')
00427     ///   will be used. String value should have patterns for search in
00428     ///   the user agent string, and should look like: "Name1 Name2 Name3".
00429     ///   You can use any delimiters from the following list " ;|~\t".
00430     ///   If you want to use space or any other symbol from the delimiters list
00431     ///   as part of the pattern, then use multi-line pattern. In this case
00432     ///   each line contains a single pattern with any symbol inside it.
00433     ///   All patterns are case sensitive by default unless fNoCase flag is specified.
00434     /// @param exclude_patterns
00435     ///   This parameter and string from (section 'CGI', name 'NotMobileDevices')
00436     ///   can be used to remove any user agent signature from list of mobile
00437     ///   devices, if you don't agree with parser's decision. IsMobileDevice()
00438     ///   will return FALSE if the user agent string contains one of these patterns.
00439     /// @return
00440     ///    Return TRUE for all devices with user agent strings that use mobile
00441     ///    version of browser or have any known mobile platform signatures.
00442     ///    The device can be a phone or tablet, or any other device with
00443     ///    the same keywords in the UA-string.
00444     /// @note
00445     ///   Registry file:
00446     ///       [CGI]
00447     ///       MobileDevices = ...
00448     ///       NotMobileDevices = ...
00449     ///   Environment variables:
00450     ///       NCBI_CONFIG__CGI__MobileDevices = ...
00451     ///       NCBI_CONFIG__CGI__NotMobileDevices = ...
00452     /// @sa
00453     ///   GetDeviceType, GetPlatform, EBrowserPlatform, CParam, IsPhoneDevice, IsTabletDevice
00454     bool IsMobileDevice(const string& include_patterns = kEmptyStr,
00455                         const string& exclude_patterns = kEmptyStr) const;
00456 
00457 protected:
00458     /// Init class members.
00459     void x_Init(void);
00460     /// Parse user agent string.
00461     void x_Parse(const string& user_agent);
00462     /// Parse token with browser name and version.
00463     bool x_ParseToken(const string& token, int where);
00464     /// Helper method to check UA-string against external pattern lists.
00465     bool x_CheckPattern(int what, bool current_status, bool use_patterns,
00466                         const string& include_patterns = kEmptyStr,
00467                         const string& exclude_patterns = kEmptyStr) const;
00468 protected:
00469     string            m_UserAgent;      ///< User-Agent string
00470     TFlags            m_Flags;          ///< Comparison and parsing flags
00471     EBrowser          m_Browser;        ///< Browser type
00472     string            m_BrowserName;    ///< Browser name
00473     TUserAgentVersion m_BrowserVersion; ///< Browser version info
00474     EBrowserEngine    m_Engine;         ///< Browser engine type
00475     TUserAgentVersion m_EngineVersion;  ///< Browser engine version
00476     TUserAgentVersion m_MozillaVersion; ///< Browser mozilla version
00477     EBrowserPlatform  m_Platform;       ///< Platform type
00478     TDeviceFlags      m_DeviceFlags;    ///< Device type flags
00479 };
00480 
00481 
00482 END_NCBI_SCOPE
00483 
00484 #endif  /* CGI___USER_AGENT__HPP */
Modified on Mon May 25 11:32:34 2015 by modify_doxy.py rev. 426318