Improve file detection with signature check capabilities

This allows more complex detection by matching regex rules against a limited
number of lines.
This commit is contained in:
Jöran Karl
2023-05-04 23:48:42 +02:00
parent d8e9d61a95
commit 433879046e
37 changed files with 130 additions and 85 deletions

View File

@@ -5,7 +5,7 @@ filetype: powershell
detect:
filename: "\\.ps(1|m1|d1)$"
#header: ""
#signature: ""
rules:
# - comment.block: # Block Comment

View File

@@ -2,7 +2,7 @@
Here are micro's syntax files.
Each yaml file specifies how to detect the filetype based on file extension or headers (first line of the file).
Each yaml file specifies how to detect the filetype based on the file extension or a given signature. The signature is matched against at most 100 lines of the file (to limit parse times) to make a best "guess".
Then there are patterns and regions linked to highlight groups which tell micro how to highlight that filetype.
Making your own syntax files is very simple. I recommend you check the file after you are finished with the

View File

@@ -2,7 +2,7 @@ filetype: awk
detect:
filename: "\\.awk$"
header: "^#!.*bin/(env +)?awk( |$)"
signature: "^#!.*bin/(env +)?awk( |$)"
rules:
- preproc: "\\$[A-Za-z0-9_!@#$*?\\-]+"

View File

@@ -2,7 +2,7 @@ filetype: batch
detect:
filename: "(\\.bat$|\\.cmd$)"
# header: ""
# signature: ""
rules:
# Numbers

View File

@@ -2,7 +2,7 @@ filetype: crontab
detect:
filename: "crontab$"
header: "^#.*?/etc/crontab"
signature: "^#.*?/etc/crontab"
rules:
# The time and date fields are:

View File

@@ -1,7 +1,7 @@
filetype: csharp-script
detect:
filename: "\\.csx$"
header: "^#!.*/(env +)?dotnet-script( |$)"
signature: "^#!.*/(env +)?dotnet-script( |$)"
rules:
- include: "csharp"

View File

@@ -2,7 +2,7 @@ filetype: fish
detect:
filename: "\\.fish$"
header: "^#!.*/(env +)?fish( |$)"
signature: "^#!.*/(env +)?fish( |$)"
rules:
# Numbers

View File

@@ -5,7 +5,7 @@ filetype: godoc
detect:
filename: "\\.godoc$"
header: package.*import
signature: package.*import
rules:
- preproc: "^[^ ].*"

View File

@@ -2,7 +2,7 @@ filetype: groovy
detect:
filename: "(\\.(groovy|gy|gvy|gsh|gradle)$|^[Jj]enkinsfile$)"
header: "^#!.*/(env +)?groovy *$"
signature: "^#!.*/(env +)?groovy *$"
rules:
# And the style guide for constants is CONSTANT_CASE

View File

@@ -2,7 +2,7 @@ filetype: html4
detect:
filename: "\\.htm[l]?4$"
header: "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN|http://www.w3.org/TR/html4/strict.dtd\">"
signature: "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN|http://www.w3.org/TR/html4/strict.dtd\">"
rules:
- error: "<[^!].*?>"

View File

@@ -2,7 +2,7 @@ filetype: html5
detect:
filename: "\\.htm[l]?5$"
header: "<!DOCTYPE html5>"
signature: "<!DOCTYPE html5>"
rules:
- error: "<[^!].*?>"

View File

@@ -2,7 +2,7 @@ filetype: javascript
detect:
filename: "(\\.js$|\\.es[5678]?$|\\.mjs$)"
header: "^#!.*/(env +)?node( |$)"
signature: "^#!.*/(env +)?node( |$)"
rules:
- constant.number: "\\b[-+]?([1-9][0-9]*|0[0-7]*|0x[0-9a-fA-F]+)([uU][lL]?|[lL][uU]?)?\\b"

View File

@@ -2,7 +2,7 @@ filetype: json
detect:
filename: "\\.json$"
header: "^\\{$"
signature: "^\\{$"
rules:
- constant.number: "\\b[-+]?([1-9][0-9]*|0[0-7]*|0x[0-9a-fA-F]+)([uU][lL]?|[lL][uU]?)?\\b"

View File

@@ -2,7 +2,7 @@ filetype: julia
detect:
filename: "\\.jl$"
header: "^#!.*/(env +)?julia( |$)"
signature: "^#!.*/(env +)?julia( |$)"
rules:

View File

@@ -3,7 +3,7 @@ filetype: 'justfile'
detect:
filename: "(^\\.?[Jj]ustfile|\\.just)$"
header: "^#!.*/(env +)?[bg]?just --justfile"
signature: "^#!.*/(env +)?[bg]?just --justfile"
rules:
- preproc: "\\<(ifeq|ifdef|ifneq|ifndef|else|endif)\\>"

View File

@@ -2,7 +2,7 @@ filetype: mail
detect:
filename: "(.*/mutt-.*|\\.eml)$"
header: "^From .* \\d+:\\d+:\\d+ \\d+"
signature: "^From .* \\d+:\\d+:\\d+ \\d+"
rules:
- type: "^From .*"

View File

@@ -1,4 +1,5 @@
//+build ignore
//go:build ignore
// +build ignore
package main
@@ -16,15 +17,15 @@ import (
type HeaderYaml struct {
FileType string `yaml:"filetype"`
Detect struct {
FNameRgx string `yaml:"filename"`
HeaderRgx string `yaml:"header"`
FNameRgx string `yaml:"filename"`
SignatureRgx string `yaml:"signature"`
} `yaml:"detect"`
}
type Header struct {
FileType string
FNameRgx string
HeaderRgx string
FileType string
FNameRgx string
SignatureRgx string
}
func main() {
@@ -58,7 +59,7 @@ func encode(name string, c HeaderYaml) {
f, _ := os.Create(name + ".hdr")
f.WriteString(c.FileType + "\n")
f.WriteString(c.Detect.FNameRgx + "\n")
f.WriteString(c.Detect.HeaderRgx + "\n")
f.WriteString(c.Detect.SignatureRgx + "\n")
f.Close()
}
@@ -69,7 +70,7 @@ func decode(name string) Header {
var hdr Header
hdr.FileType = string(strs[0])
hdr.FNameRgx = string(strs[1])
hdr.HeaderRgx = string(strs[2])
hdr.SignatureRgx = string(strs[2])
fmt.Printf("took %v\n", time.Since(start))
return hdr

View File

@@ -2,7 +2,7 @@ filetype: makefile
detect:
filename: "([Mm]akefile|\\.ma?k)$"
header: "^#!.*/(env +)?[bg]?make( |$)"
signature: "^#!.*/(env +)?[bg]?make( |$)"
rules:
- preproc: "\\<(ifeq|ifdef|ifneq|ifndef|else|endif)\\>"

View File

@@ -2,7 +2,7 @@ filetype: nginx
detect:
filename: "nginx.*\\.conf$|\\.nginx$"
header: "^(server|upstream)[a-z ]*\\{$"
signature: "^(server|upstream)[a-z ]*\\{$"
rules:
- preproc: "\\b(events|server|http|location|upstream)[[:space:]]*\\{"

View File

@@ -2,7 +2,7 @@ filetype: patch
detect:
filename: "\\.(patch|diff)$"
header: "^diff"
signature: "^diff"
rules:
- brightgreen: "^\\+.*"

View File

@@ -2,7 +2,7 @@ filetype: perl
detect:
filename: "\\.p[lmp]$"
header: "^#!.*/(env +)?perl( |$)"
signature: "^#!.*/(env +)?perl( |$)"
rules:
- type: "\\b(accept|alarm|atan2|bin(d|mode)|c(aller|homp|h(dir|mod|op|own|root)|lose(dir)?|onnect|os|rypt)|d(bm(close|open)|efined|elete|ie|o|ump)|e(ach|of|val|x(ec|ists|it|p))|f(cntl|ileno|lock|ork))\\b|\\b(get(c|login|peername|pgrp|ppid|priority|pwnam|(host|net|proto|serv)byname|pwuid|grgid|(host|net)byaddr|protobynumber|servbyport)|([gs]et|end)(pw|gr|host|net|proto|serv)ent|getsock(name|opt)|gmtime|goto|grep|hex|index|int|ioctl|join)\\b|\\b(keys|kill|last|length|link|listen|local(time)?|log|lstat|m|mkdir|msg(ctl|get|snd|rcv)|next|oct|open(dir)?|ord|pack|pipe|pop|printf?|push|q|qq|qx|rand|re(ad(dir|link)?|cv|say|do|name|quire|set|turn|verse|winddir)|rindex|rmdir|s|scalar|seek(dir)?)\\b|\\b(se(lect|mctl|mget|mop|nd|tpgrp|tpriority|tsockopt)|shift|shm(ctl|get|read|write)|shutdown|sin|sleep|socket(pair)?|sort|spli(ce|t)|sprintf|sqrt|srand|stat|study|substr|symlink|sys(call|read|tem|write)|tell(dir)?|time|tr(y)?|truncate|umask)\\b|\\b(un(def|link|pack|shift)|utime|values|vec|wait(pid)?|wantarray|warn|write)\\b"

View File

@@ -2,7 +2,7 @@ filetype: python2
detect:
filename: "\\.py2$"
header: "^#!.*/(env +)?python2$"
signature: "^#!.*/(env +)?python2$"
rules:

View File

@@ -2,7 +2,7 @@ filetype: python
detect:
filename: "\\.py(3)?$"
header: "^#!.*/(env +)?python(3)?$"
signature: "^#!.*/(env +)?python(3)?$"
rules:
# built-in objects

View File

@@ -2,7 +2,7 @@ filetype: ruby
detect:
filename: "\\.(rb|rake|gemspec)$|^(.*[\\/])?(Gemfile|config.ru|Rakefile|Capfile|Vagrantfile|Guardfile|Appfile|Fastfile|Pluginfile|Podfile|\\.?[Bb]rewfile)$"
header: "^#!.*/(env +)?ruby( |$)"
signature: "^#!.*/(env +)?ruby( |$)"
rules:
- comment.bright:

View File

@@ -2,7 +2,7 @@ filetype: sage
detect:
filename: "\\.sage$"
header: "^#!.*/(env +)?sage( |$)"
signature: "^#!.*/(env +)?sage( |$)"
rules:

View File

@@ -2,7 +2,7 @@ filetype: sed
detect:
filename: "\\.sed$"
header: "^#!.*bin/(env +)?sed( |$)"
signature: "^#!.*bin/(env +)?sed( |$)"
rules:
- symbol.operator: "[|^$.*+]"

View File

@@ -24,7 +24,7 @@ filetype: shell
# * bash-fc. (followed by a random string)
detect:
filename: "(\\.(sh|bash|ash|ebuild)$|(\\.bash(rc|_aliases|_functions|_profile)|\\.?profile|Pkgfile|pkgmk\\.conf|rc\\.conf|PKGBUILD|APKBUILD)$|bash-fc\\.)"
header: "^#!.*/(env +)?(ba)?(a)?(mk)?sh( |$)"
signature: "^#!.*/(env +)?(ba)?(a)?(mk)?sh( |$)"
rules:
# Numbers

View File

@@ -137,7 +137,7 @@ func generateFile(filetype, syntax, header string, rules []interface{}) string {
output += fmt.Sprintf("detect: \n filename: \"%s\"\n", strings.Replace(strings.Replace(syntax, "\\", "\\\\", -1), "\"", "\\\"", -1))
if header != "" {
output += fmt.Sprintf(" header: \"%s\"\n", strings.Replace(strings.Replace(header, "\\", "\\\\", -1), "\"", "\\\"", -1))
output += fmt.Sprintf(" signature: \"%s\"\n", strings.Replace(strings.Replace(header, "\\", "\\\\", -1), "\"", "\\\"", -1))
}
output += "\nrules:\n"

View File

@@ -2,7 +2,7 @@ filetype: systemd
detect:
filename: "\\.(service|socket|timer)$"
header: "^\\[Unit\\]$"
signature: "^\\[Unit\\]$"
rules:
- statement: "^(Accept|After|Alias|AllowIsolate|Also|ANSI_COLOR|_AUDIT_LOGINUID|_AUDIT_SESSION|Backlog|Before|BindIPv6Only|BindsTo|BindToDevice|BlockIOReadBandwidth|BlockIOWeight|BlockIOWriteBandwidth|_BOOT_ID|Broadcast|BUG_REPORT_URL|BusName|Capabilities|CapabilityBoundingSet|CHASSIS|cipher|class|_CMDLINE|CODE_FILE|CODE_FUNC|CODE_LINE|_COMM|Compress|ConditionACPower|ConditionCapability|ConditionDirectoryNotEmpty|ConditionFileIsExecutable|ConditionFileNotEmpty|ConditionHost|ConditionKernelCommandLine|ConditionNull|ConditionPathExists|ConditionPathExistsGlob|ConditionPathIsDirectory|ConditionPathIsMountPoint|ConditionPathIsReadWrite|ConditionPathIsSymbolicLink|ConditionSecurity|ConditionVirtualization|Conflicts|ControlGroup|ControlGroupAttribute|ControlGroupModify|ControlGroupPersistent|controllers|Controllers|CPE_NAME|CPUAffinity|CPUSchedulingPolicy|CPUSchedulingPriority|CPUSchedulingResetOnFork|CPUShares|CrashChVT|CrashShell|__CURSOR|debug|DefaultControllers|DefaultDependencies|DefaultLimitAS|DefaultLimitCORE|DefaultLimitCPU|DefaultLimitDATA|DefaultLimitFSIZE|DefaultLimitLOCKS|DefaultLimitMEMLOCK|DefaultLimitMSGQUEUE|DefaultLimitNICE|DefaultLimitNOFILE|DefaultLimitNPROC|DefaultLimitRSS|DefaultLimitRTPRIO|DefaultLimitRTTIME|DefaultLimitSIGPENDING|DefaultLimitSTACK|DefaultStandardError|DefaultStandardOutput|Description|DeviceAllow|DeviceDeny|DirectoryMode|DirectoryNotEmpty|Documentation|DumpCore|entropy|Environment|EnvironmentFile|ERRNO|event_timeout|_EXE|ExecReload|ExecStart|ExecStartPost|ExecStartPre|ExecStop|ExecStopPost|ExecStopPre|filter|FONT|FONT_MAP|FONT_UNIMAP|ForwardToConsole|ForwardToKMsg|ForwardToSyslog|FreeBind|freq|FsckPassNo|fstab|_GID|Group|GuessMainPID|HandleHibernateKey|HandleLidSwitch|HandlePowerKey|HandleSuspendKey|hash|HibernateKeyIgnoreInhibited|HOME_URL|_HOSTNAME|ICON_NAME|ID|IdleAction|IdleActionSec|ID_LIKE|ID_MODEL|ID_MODEL_FROM_DATABASE|IgnoreOnIsolate|IgnoreOnSnapshot|IgnoreSIGPIPE|InaccessibleDirectories|InhibitDelayMaxSec|init|IOScheduling
Class|IOSchedulingPriority|IPTOS|IPTTL|JobTimeoutSec|JoinControllers|KeepAlive|KEYMAP|KEYMAP_TOGGLE|KillExcludeUsers|KillMode|KillOnlyUsers|KillSignal|KillUserProcesses|LidSwitchIgnoreInhibited|LimitAS|LimitCORE|LimitCPU|LimitDATA|LimitFSIZE|LimitLOCKS|LimitMEMLOCK|LimitMSGQUEUE|LimitNICE|LimitNOFILE|LimitNPROC|LimitRSS|LimitRTPRIO|LimitRTTIME|LimitSIGPENDING|LimitSTACK|link_priority|valueListenDatagram|ListenFIFO|ListenMessageQueue|ListenNetlink|ListenSequentialPacket|ListenSpecial|ListenStream|LogColor|LogLevel|LogLocation|LogTarget|luks|_MACHINE_ID|MakeDirectory|Mark|MaxConnections|MaxFileSec|MaxLevelConsole|MaxLevelKMsg|MaxLevelStore|MaxLevelSyslog|MaxRetentionSec|MemoryLimit|MemorySoftLimit|MESSAGE|MESSAGE_ID|MessageQueueMaxMessages|MessageQueueMessageSize|__MONOTONIC_TIMESTAMP|MountFlags|NAME|NAutoVTs|Nice|NonBlocking|NoNewPrivileges|NotifyAccess|OnActiveSec|OnBootSec|OnCalendar|OnFailure|OnFailureIsolate|OnStartupSec|OnUnitActiveSec|OnUnitInactiveSec|OOMScoreAdjust|Options|output|PAMName|PartOf|PassCredentials|PassSecurity|PathChanged|PathExists|PathExistsGlob|PathModified|PermissionsStartOnly|_PID|PIDFile|PipeSize|PowerKeyIgnoreInhibited|PRETTY_HOSTNAME|PRETTY_NAME|Priority|PRIORITY|PrivateNetwork|PrivateTmp|PropagatesReloadTo|pss|RateLimitBurst|RateLimitInterval|ReadOnlyDirectories|ReadWriteDirectories|__REALTIME_TIMESTAMP|ReceiveBuffer|RefuseManualStart|RefuseManualStop|rel|ReloadPropagatedFrom|RemainAfterExit|RequiredBy|Requires|RequiresMountsFor|RequiresOverridable|Requisite|RequisiteOverridable|ReserveVT|ResetControllers|Restart|RestartPreventExitStatus|RestartSec|RootDirectory|RootDirectoryStartOnly|RuntimeKeepFree|RuntimeMaxFileSize|RuntimeMaxUse|RuntimeWatchdogSec|samples|scale_x|scale_y|Seal|SecureBits|_SELINUX_CONTEXT|SendBuffer|SendSIGKILL|Service|ShowStatus|ShutdownWatchdogSec|size|SmackLabel|SmackLabelIPIn|SmackLabelIPOut|SocketMode|Sockets|SourcePath|_SOURCE_REALTIME_TIMESTAMP|SplitMode|StandardError|StandardInput|StandardOutput|StartLimitActio
n|StartLimitBurst|StartLimitInterval|static_node|StopWhenUnneeded|Storage|string_escape|none|replaceSuccessExitStatus|SupplementaryGroups|SUPPORT_URL|SuspendKeyIgnoreInhibited|SyslogFacility|SYSLOG_FACILITY|SyslogIdentifier|SYSLOG_IDENTIFIER|SyslogLevel|SyslogLevelPrefix|SYSLOG_PID|SystemCallFilter|SYSTEMD_ALIAS|_SYSTEMD_CGROUP|_SYSTEMD_OWNER_UID|SYSTEMD_READY|_SYSTEMD_SESSION|_SYSTEMD_UNIT|_SYSTEMD_USER_UNIT|SYSTEMD_WANTS|SystemKeepFree|SystemMaxFileSize|SystemMaxUse|SysVStartPriority|TCPCongestion|TCPWrapName|timeout|TimeoutSec|TimeoutStartSec|TimeoutStopSec|TimerSlackNSec|Transparent|_TRANSPORT|tries|TTYPath|TTYReset|TTYVHangup|TTYVTDisallocate|Type|_UID|UMask|Unit|User|UtmpIdentifier|VERSION|VERSION_ID|WantedBy|Wants|WatchdogSec|What|Where|WorkingDirectory)="

View File

@@ -2,7 +2,7 @@ filetype: tcl
detect:
filename: "\\.tcl$"
header: "^#!.*/(env +)?tclsh( |$)"
signature: "^#!.*/(env +)?tclsh( |$)"
rules:
- statement: "\\b(after|append|array|auto_execok|auto_import|auto_load|auto_load_index|auto_qualify|binary|break|case|catch|cd|clock|close|concat|continue|else|elseif|encoding|eof|error|eval|exec|exit|expr|fblocked|fconfigure|fcopy|file|fileevent|flush|for|foreach|format|gets|glob|global|history|if|incr|info|interp|join|lappend|lindex|linsert|list|llength|load|lrange|lreplace|lsearch|lset|lsort|namespace|open|package|pid|puts|pwd|read|regexp|regsub|rename|return|scan|seek|set|socket|source|split|string|subst|switch|tclLog|tell|time|trace|unknown|unset|update|uplevel|upvar|variable|vwait|while)\\b"

View File

@@ -2,7 +2,7 @@ filetype: xml
detect:
filename: "\\.(xml|sgml?|rng|svg|plist)$"
header: "<\\?xml.*\\?>"
signature: "<\\?xml.*\\?>"
rules:
- preproc:

View File

@@ -2,7 +2,7 @@ filetype: yaml
detect:
filename: "\\.ya?ml$"
header: "%YAML"
signature: "%YAML"
rules:
- type: "(^| )!!(binary|bool|float|int|map|null|omap|seq|set|str) "

View File

@@ -2,7 +2,7 @@ filetype: zsh
detect:
filename: "(\\.zsh$|\\.?(zshenv|zprofile|zshrc|zlogin|zlogout)$)"
header: "^#!.*/(env +)?zsh( |$)"
signature: "^#!.*/(env +)?zsh( |$)"
rules:
## Numbers