###
###  AWK Script to clean up an OpenPegasus trace by modifiying the info
###  for a number of fields including
###      - Change from absolute time to relative time to make seeing
###        time between events easier
###      - Change from numberic thread ID to symbolic threadID where the
###        symbolic thread ID is THREADxx 
###      - Indent XML that is not indented in the trace
###      - Display only trace lines after a defined point where the
###      - defined point is the first trace line with a defined Component
###        The component is defined by the startElement variable on the
###        command line with default of XmlIO
###       - set Marker on selected lines.  Selection defined by input file
###        This requires match of each item in the input array of strings
###        against each line
###
#### TODO
#### Add filter on display by component type
#### Add color for particular things we find (Color would be option vs some
####    other marker for particular things (ex. error found)
#### Find way to trace through complete operation from thread to thread
#### Find way to match enter and exit calls. Could be indent or some marker.
#### Would probably assume same thread

#### NOTE: This code uses one regex expression specific to to posix awk
####       the expression interval (i.e. the {number}.  This may requie
####       that the calling function include either the --posix or
####       -re-interval option on the command line to include this
####       functionality. If the time and thread info do not translate
####       it is probably this incompatibility..
#### 
BEGIN {
      false = 0          ## defintion of true and false values for readability
      true = 1

      startTime = 0       ## Time of first trace entry 
      threadID = 1        ## variable to hold current thread number
      processID = 1       ## variable to hold current process number
      showOutput = 0      ## Boolean flag defines when output display is started
      indent = 0          ## indent level for xml
      isCompleteElement = false;
      keywordsExist = false   ## Boolean enables or disables keyword search
                          ## disabled unless keywords variable is non-zero
      outputLineCount = 0 ## count of lines displayed

      namesFile = "FileNames.txt"  ## default name for the nameFile, the file that
                          ## is built at end and contains summary information

      fileName = "None"   ## filename on input. May be supplied as variable or 
                          ## or derived from first input record
      outputFileSuffix = ".fmtd"
      outputFileName = fileName outputFileSuffix

      markThisLine = false   ## indicating that the current line will be marked
                             ## with leading + per

      markCount = 0       ## count of marks set.

      ## If the name of the trace components is not defined on the cmd line
      ## default it to XmlIO so that we do not show anything until the
      ## first XmlIO record is encountered.
      ## NOTE: This is too limited in that it does not allow having
      ## no startElement, i.e. display starting at first line of trace.

      print "Cmd line input of startElement=" startElement "." "length= " length(startElement)
      if (startElement == "")
      {
         startElement = "XmlIO"
      }

      if (length(startElement))
      {
         print "Display Starts at first Trace line component  = " startElement
      }
      else
      {
         print "Display starts at first line of original trace"
      }

      ## Get the pegasus variables
      pegRoot = ENVIRON["PEGASUS_ROOT"]

      ## Create the keywordRegex from list of possible strings
      ## Comma separated list of keywords which will be marked on output
      keywords = "XmlIO,error,IMETHODCALL,TestServerProfileProvider,Request: queue id:,"
      keywords = keywords "Response has error,Enumeration Set Error msg,After putCache insert,"
      keywords = keywords "TestServerProfileProvider:,putcache,getcache,"
      keywords = keywords "setupFuture,_issueImmediateOpenOr,_issueOpenOr"
      keywords = keywords "IMETHODRESPONSE,SetupFutureResponse,"
      keywords = keywords "ContextId=,"
      keywords = keywords "setProcessingState,"
      keywords = keywords "setProcessingState,"
      keywords = keywords "StopTimer,"
      keywords = keywords "StartTimer"
      if (keywords == "")
      {
         ##keywords = "error,IMETHODCALL,TestServerProfileProvider"
      }

      ## Need to test to be sure no | in keywords

      if (keywords != "")
      {
         keywordsExist = true

         ## split the input keyword list and create a regex or statement
         numKeywords = split(keywords, keyWordArray,",")
         if (numKeywords > 1)
         {
            keywordRegex = keyWordArray[1]
            for ( i = 2; i <= numKeywords; i++)
            {
               keywordRegex = keywordRegex "|" keyWordArray[i]
            }
         }
         print "Mark lines with the following keywords", keywordRegex
      }

      ## Exclude components filter. This is a command line input parameter
      ## TODO all these should be input in form a,b,c and we map to the
      ## regex.
      excludeComponentFilter = ""

      if (excludeComponentFilter != "")
      {
         print "Excluding the following components from output", excludeComponentFilter
      }
}

############################################################################
##
##       Process the file
##
###########################################################################
##
## Get the first input file name and create the output file name based on
## this name. If filename was already set, do not change. This is done once
## on the first line input because the name on the input line does not exist
## in FILENAME until after the first read.
FNR == NR && NR == 1 {
   fileName = FILENAME

   if (match(fileName,"\\.tmp$"))
   {
      ## remove the tmp suffix since we want only the main file name
      ## for output
      sub("\\.tmp$", "", fileName)
   }

   ## define the output file name and initialize it empty
   outputFileName = fileName outputFileSuffix

   print "output fileName= ", outputFileName
}

## Trace element name that acts as flag to turn on trace. Output is enabled
## first time the element name defined by startElement is encountered.
## Once turned on, there is no way to turn off the output
!(showOutput) && ($2 == startElement) {
   showOutput = 1
   print "Start Display at Line " FNR " in file " FILENAME " TraceComponent " startElement
}

## Test to see if this line is to be marked by matching to the regexList
## If it matches, set the markThisLine variable so it can be marked on output
(keywordsExist){
   if (match($0,keywordRegex))
   {
      markThisLine = true
   }
   else
   {
      markThisLine = false
   }
}

## Scan line for XML Line that is not indented.  We will indent all
## consecutive XML lines in accord with xml rules
## TODO: Modify so we do not indent the first line.
{ if($0 ~ /^</)
   {
      #### print "FOUND XML START"
      inXMLState = true
      if ($1 == "<?xml")
      {
         indent = 0
      }
      if ($1 == "</CIM>")
      {
         indent = -1
      }
   }
   ## non-consecutive xml line found. turn off xml indent
   else
   {
      #### print "FOUND XML END"
      inXMLState = false
   }
}

## Indent any unindented XML
## If inXml state indent the line by indent variable
## this indents all consecutive lines that start with the XML left caret
## character in line with their xml indent level.
## NOTE: It does not indent any case of xml on a single line
(inXMLState) {
   #### DEBUG for xml displayprint "FOUND NEXT XML Line", "indent=" indent, " isComplete " isCompleteElement, $0
   ## if this is not complete element (i.e start and end on single line
   ## include </elementName> as possible terminator on same line
   ### issue with element that terminates on same line. needs to indent and
   ### then outdent after print.
##          <IMETHODCALL NAME="References">
##            <LOCALNAMESPACEPATH>
##            <NAMESPACE NAME="test"/>
##            <NAMESPACE NAME="TestProvider"/>
##          </LOCALNAMESPACEPATH>

   elementName = substr($1,2)
   terminator = "</" elementName ">$"
   ## if this line is an entity close element
   ## TODO why the second char is /. seems to match </A/>
   if ($0 ~ "^</[A-Z]/>$")
   {
      type = 1
      indent--
      isCompleteElement = true
   }
   ## if this is a complete start and end pair
   else if (($0 ~ "^<[A-Z]+>.+</[A-Z]+>$"))
   {
      type = 6
      ## Really should match the element name begin and end
      ## complete open and close of an element
      if (!isCompleteElement)
      {
         indent++
      }
      isCompleteElement = true
   }
   ## else if this is a complete element
   ## <NAMESPACE NAME="PG_InterOp"/> This is NOT a 5. It is a 2
   ## 11 2                      <NAMESPACE NAME="root"/>
   ## 11 5                      <NAMESPACE NAME="PG_InterOp"/> This should be type 2
   else if ($0 ~ "^<[A-Z]+[a-z=\" ]*/>$")
   {
      type = 2
      if (!isCompleteElement)
      {
         indent++
      }
      isCompleteElement = true
   }

   ## if there is a complete start element and end element
   ## <KEYVALUE VALUETYPE="string">PG:127-0-1-1</KEYVALUE>
   else if($0 ~ terminator)
   {
      type = 3
      if (!isCompleteElement)
      {
        indent++
        isCompleteElement = true
      }
   }
   else
   {
      ## if terminating element
      if ($0 ~ "^</")
      {
         ##if ((indent != 0) && (type != 3))
         if ((indent != 0) && isCompleteElement)
         {
            indent--
         }
         type = 4
         isCompleteElement = true
      }
      else
      {
        type = 5
        if (!isCompleteElement)
        {
           indent++
        }
        isCompleteElement = false
      }
   }

   ## create the indent string, (2 char indent per level)
   indentStr = ""
   for (x = 0; x < indent; x++)
   {
      indentStr = indentStr "  "
   }
   #### print "indentStr" indentStr "EndIndentStr", "indent=" indent, $0

   ## prepend the indent string to $0
   $0 = indentStr $0  
}

## input record times not corrected yet.  See consolidateTrace which
## should do this if called.  This allows direct use of this formatter
## with .trc files without the format conversion.
## substitute the time into the time field to move from absolute seconds
## to relative seconds. This must be the first field in the input variable
## substitute time of yyyy.zzzzzz for  relative sec and usec
#                     yyyyyyyyyys-zzzzzzs absolute sec and usec at BOL
## NOTE: This function was from before we used the consolidate.awk function
## to normalize the us time field.  If you use the formattrace script
## this should never be called.
/^[0-9]{10}s-[0-9]{1,6}us:/ {
   if (showOutput)
   {
      ## Normalize the time field for a sort from  seconds-microseconds
      ## seconds.microseconds as a single decimal number of fixed length
      split($1, timeFields,"-")
      sub("s$","",timeFields[1])
      sub("us:$","",timeFields[2])
      while (length(timeFields[2]) < 6)
      {
        timeFields[2] = "0" timeFields[2]
      }

      ## if this is the first entry, save the current time as start time.
      if (startTime == 0)
      {
         startTime = timeFields[1]
      }

      time = timeFields[1] - startTime
      newTime = sprintf("%.4u", time)
      newTime = newTime "." timeFields[2] ":"

      #### print "Time 1 " $1 " formatted to  " newTime " starttime " startTime
      $1 = newTime
    }
}

## If the file was already mapped to the new time format then we
## want to adjust to relative time which is a slightly different format
## than the original trace format. If you use the formattrace script
## this is the reformatter for the log record time field. It assumes that
## the second part of the field (us) is fixed length and the oveall format
## of the time field is
##      ssssssssss.uuuuuu:

/^[0-9]{10}.[0-9]{6}: / {
   if (showOutput)
   {
      ## if this is the first entry, save the current time as start time.
      split($1, timeFields,".")
      if (startTime == 0)
      {
         startTime = timeFields[1]
      }
      time = timeFields[1] - startTime
      newTime = sprintf("%.4u", time)
      newTime = newTime "." timeFields[2]

      #### print "Time 2 " $1 " formatted to  " newTime " starttime " startTime
      $1 = newTime

      ## get the ProcessName, threadName, etc.  from trace entry 3
      ## Use the two expressions because the second one required for a very
      ## few trace entries (ex. header files include complete path) but makes
      ## the execution extremly long so we try to match on first.
      if (($3 ~ /^\[[0-9]{2,5}:[0-9]{5,15}:[A-Za-z0-9_.]{1,40}:[0-9]{1,6}\]/ ) ||
          ($3 ~ /^\[[0-9]{2,5}:[0-9]{5,15}:[[:print:]]{1,120}:[0-9]{1,6}\]/ ))
      {
         split($3, procFields, ":")

         processName = substr(procFields[1],2)  ## get and delete leading [
         threadName = procFields[2]
         srcFileName = procFields[3]
         lineNum = procFields[4]

         ##print "Processinfo = " " |" $3 "| processName= ", processName, " ThreadName= ", threadName
         ## Display name for the thread. The unknown is for errors only
         localProcessName = "Unknown"
         localThreadName = "Unknown"

         ## if this is new process, add to the array of process names
         if (!(processName in processNameArray))
         {
             ## create the new process name
             x = sprintf("%.2u", processID++)
             processNameArray[processName] = ("P" x)
         }
         localProcessName = processNameArray[processName]

         ## if this is new thread, add to the array of thread names.
         if (!(threadName in threadNameArray))
         {
             ## create the new thread name
             x = sprintf("%.2u", threadID++)
             threadNameArray[threadName] = ("TH" x)
         }
         localThreadName = threadNameArray[threadName]

         ##print "ThreadElementFound ", $2, " thread info=" $3, " threadName=" threadName, "newThreadName=" newThreadName
         ## substitute the new name into $3
         $3 = "[" localProcessName ":" localThreadName ":" srcFileName ":" lineNum
      }
      else
      {
         print "No Match for field Process:Thread:Source:LineNum field " $3
         print "line " $0
      }
   }

   ## Accumulate info an functions entered and exited.
   ## We could actually accumulte time in function here. But more complex
   ## because of multitask so we would have to match input to output 
   ## threads and process.
   Action = $4 $5
   if (Action ~ /^Enteringmethod$/)
   {
      functionArray[$6]++
   }
   else if (Action ~ /^Exitingmedhod$/ )
   {
      functionArray[$6]--
   }

}

## exclude from display any lines where the component name matches the
## excludeFilter regex.  Note that this is a match against arguement 2
## directly.  This is the last item so that all other processing is complete
## before we decide to terminate.  Also, this could be directly done
## in the next pattern action just as well.
{ if ((excludeComponentFilter != "") && !inXmlState)
   {
      if (match($2,excludeComponentFilter))
      {
         next
      }
   }
}

## print any resulting line so all lines get output.\ once showOutput has
## been enabled.  This function
## inserts the marker in the line either a single + or minus
showOutput {
    marker = ""
    if (markThisLine != 0)
    {
        marker = "+"
        markCount++
    }
    outputLineCount++
    ## debugging output
    ##printf("%u %u%s%s\n", indent, type, marker, $0) >> outputFileName

    printf("%s%s\n", marker, $0) >> outputFileName
}

function USAGE()
{
    print "Usage: awk --posix -f formattrace.awk [variables] [inputTraceName]"

    print "Formats and organizes the OpenPegasus trace files "
    print "and produces an output file named CIM_Tr_*.fmtd"
    print "That:"
    print "    - consolidates the multiple trace files into a single file, "
    print "    - modifies the time relative to the first displayed entry"
    print "    - renames the thread and process ids to something more readable"
    print "    - Marks selected log entries so they can be found easily"
    print "    - Excludes selected comonents"
    print "    - Cuts off log entries before the first instance of a defined"
    print "      component (default XmlIO) which also serves as the relative"
    print "    - start time."
    print "where --posix is required for awk compatibility"
    print "  -f defines this awk file name"
    print "  variables are any variables that may be used by the program"
    print "     in the form -v name=value"
    print "  startElement=ComponentName that will start display output"
    print "    where default is XmlIO"
}

############################################################################
## Close by outputting the summary information to the namesFile
############################################################################
END{
    if (!(showOutput))
    {
      print "ERROR: Never found trace component to start display"

      print "ERROR: Never found trace component to start display" >> namesFile
    }

    print "Input " NR " lines from file " fileName "." >> namesFile
    ## The function not defined getStart = strftime("%b %d %Y-%T", startTime)
    ## apparentlynot defined because of posix compatibility flag.
    print "Original time for first record was " startTime " (" getStartTime ")"
    print markCount " lines marked" >> namesFile
    print "marks are: " keywordRegex >>namesFile
    print "Output " outputLineCount " lines to " outputFileName >> namesFile

    print "Thread IDs for ", arrayLength(threadNameArray) " threads" >> namesFile
    for (item in threadNameArray)
    {
        print item, threadNameArray[item] >> namesFile
    }
    print "" >> namesFile

    print "Process IDs for " arrayLength(processNameArray) " processes" >> namesFile
    for (item in processNameArray)
    {
        print item, processNameArray[item] >> namesFile
    }

    print "Count of functions called = " arrayLength(functionArray) >> namesFile
    for (item in funtionArray)
    {
       if ((functionArray[item] % 2) == 1)
       {
          print "No exit match for function" item >> namesFile
       }
    }

    for (item in funtionArray)
    {
          print item " "(functionArray[item] / 2) " " >> namesFile
    }
   ## show this on console as last thing because it is handy for next
   ## step which is usually edit of the file.
    print "output fileName= ", outputFileName
}


###########################################################################
## Miscellaneous functions


## function to get length of an array.  This exists because awk inconsistent
## between implementation about the existence of a size or length function
## for arrays.  One input, the array. Returns the count if items in the
## array. The variable i and k are local
function arrayLength( a, i, k) {
    for( i in a) k++
    return k
}