If you have ever wanted to download a bunch of HTML files from a website, you either have to do it manually (yuck!) or find some browser extension to help you along. Today we can offer a third option: using Legato. Since Legato can parse HTML files, you can quickly make a script that reads a website and decides which links to follow and which to ignore. Many browser extensions only let you filter by type of file, but with Legato the sky is the limit.
Friday, December 08. 2017
LDC #62: Too Many Links and Not Enough Time
This week’s script downloads the technical specification section of the SEC’s website. The base page, if you are curious, is https://www.sec.gov/oit/Article/info-edgar-tech-specs.html. Let’s begin by looking at the complete script:
//
//
// GoFiler Legato Script - Website Download
// ----------------------------------------
//
// Rev 12/08/2017
//
//
// (c) 2017 Novaworks, LLC -- All rights reserved.
//
// Basic script to download a website
//

#define URL 0
#define FILE 1

int load_html(string, string);
int extract_html(string);
int console_print_line(string);

string table[][];
string skip[];
string fnSite;
string fnSrcPath;
string fnRelPath;
string fnDestPath;

int main() {
    string s1, s2, s3;
    dword type;
    int sx, size;
    int rc;

    // Site
    fnSite = "https://www.sec.gov";
    // Source Path
    fnSrcPath = fnSite + "/oit/Article/";
    // Allowed Relative Path (also allows source)
    fnRelPath = fnSite + "/info/edgar/";
    // Destination Location
    fnDestPath = AddPaths(GetScriptFolder(), "Files\\");
    // Starting File
    table[sx][URL] = fnSrcPath + "info-edgar-tech-specs.html";

    // Open Console
    console_print_line("Starting download...");
    size = 1;
    while (sx < size) {
        s1 = GetFilePath(table[sx][URL]);
        s1 = MakePathRelative(fnSrcPath, s1);
        if (s1 == "") {
            s1 = GetFilePath(table[sx][URL]);
            s1 = MakePathRelative(fnRelPath, s1);
        }
        if (s1 == "") {
            if (table[0][URL] != table[sx][URL]) {
                console_print_line(FormatString(" Path not relative for %s", table[sx][URL]));
                sx++;
                continue;
            }
        }

        // Create Path as Required
        s2 = GetFilename(table[sx][URL]);
        if (s2 == "") {
            s2 = "index.htm";
        }
        s1 = fnDestPath + s1;
        s1 = PathToMSDelimiters(s1);
        CreateFolders(s1);

        table[sx][FILE] = s1 + s2;
        StatusBarMessage("%d %s", sx, table[sx][URL]);
        console_print_line(FormatString("Download %04d/%04d: %s", sx+1, size, table[sx][URL]));
        s2 = table[sx][FILE];
        rc = ERROR_NONE;
        if (IsFile(table[sx][FILE]) == FALSE) {
            rc = HTTPGetFile(table[sx][URL], table[sx][FILE]);
        }
        if (IsError(rc)) {
            console_print_line(FormatString(" ERROR 0x%08X on HTTP Get File for %s", rc, table[sx][URL]));
        }
        type = GetFileTypeCode(s2, TRUE);
        if (type == FT_HTML) {
            load_html(table[sx][FILE], table[sx][URL]);
            size = ArrayGetAxisDepth(table, AXIS_ROW);
            extract_html(table[sx][FILE]);
        }
        sx++;
    }
    console_print_line("Done!");
    return ERROR_NONE;
}

int load_html(string name, string url) {
    handle hSGML;
    string s1, s2, s3;
    int dx, sx;
    int rc;

    console_print_line(FormatString(" Processing: %s", name));
    hSGML = SGMLCreate(name);
    if (hSGML == NULL_HANDLE) {
        rc = GetLastError();
        console_print_line(FormatString(" Error 0x%08X loading as SGML: %s", rc, name));
        return rc;
    }
    dx = ArrayGetAxisDepth(table, AXIS_ROW);
    s1 = SGMLNextElement(hSGML);
    while (s1 != "") {
        s2 = SGMLGetParameter(hSGML, HA_SRC);
        if (s2 == "") {
            s2 = SGMLGetParameter(hSGML, HA_HREF);
        }
        // Ignore Relative
        if (s2[0] == '#') {
            s2 = "";
        }
        // Ignore Emails
        if (FindInString(s2, "mailto:") == 0) {
            s2 = "";
        }
        // Skip roots that aren't us
        if (IsPathQualifiedWeb(s2) && s2 != "") {
            if ((FindInString(s2, fnSrcPath) < 0) && (FindInString(s2, fnRelPath) < 0)) {
                rc = FindInList(skip, s2);
                if (rc < 0) {
                    sx = ArrayGetAxisDepth(skip);
                    skip[sx] = s2;
                    console_print_line(FormatString(" Skipping : %s", s2));
                }
                s2 = "";
            }
        }
        // Still okay?
        if (s2 != "") {
            // Qualify it
            if (IsPathQualifiedWeb(s2) == FALSE) {
                if (s2[0] == '/') {
                    s2 = fnSite + s2;
                }
                else {
                    s2 = AddPaths(GetFilePath(url), s2);
                }
            }
            // Force Secure Match
            s2 = ReplaceInString(s2, "http://", "https://");
            // Check in root
            if ((FindInString(s2, fnSrcPath) == 0) || (FindInString(s2, fnRelPath) == 0)) {
                s2 = ClipQueryOrID(s2);
                rc = FindInTable(table, s2);
                if (rc < 0) {
                    table[dx][URL] = s2;
                    console_print_line(FormatString(" Adding : %s", s2));
                    dx++;
                }
            }
            else {
                rc = FindInList(skip, s2);
                if (rc < 0) {
                    sx = ArrayGetAxisDepth(skip);
                    skip[sx] = s2;
                    console_print_line(FormatString(" Skipping : %s", s2));
                }
            }
        }
        s1 = SGMLNextElement(hSGML);
    }
    return ERROR_NONE;
}

int extract_html(string name) {
    handle hSGML;
    string s1, s2;
    int rc;

    // Extract Article
    console_print_line(" Looking Main Content...");
    hSGML = SGMLCreate(name);
    if (hSGML == NULL_HANDLE) {
        rc = GetLastError();
        console_print_line(FormatString(" Error 0x%08X loading as SGML: %s", rc, name));
        return rc;
    }
    s1 = SGMLNextElement(hSGML);
    while (s1 != "") {
        if (FindInString(s1, "id=\"main-content\"") > 0) {
            s2 = SGMLFindClosingElement(hSGML, SP_FCE_CODE_AS_IS);
            break;
        }
        s1 = SGMLNextElement(hSGML);
    }
    if (s2 == "") {
        console_print_line(" No main content! Deleting File.");
        DeleteFile(name);
        return ERROR_NONE;
    }
    CloseHandle(hSGML);
    StringToFile(s2, name);
    console_print_line(" Done.");
    return ERROR_NONE;
}

int console_print_line(string txt) {
    return ConsolePrint(txt + "\r\n");
}
We start with two defines, URL and FILE. These will help us index the two-dimensional array table, which holds both the URL of each file and the filename on disk that contains it. We then have prototypes for our three functions. The process is split into three parts: the main function that loops over our table, the load_html function that scans an HTML file for more links to download, and the extract_html function that removes extraneous parts from the download. Depending on your individual needs, this last function could be empty. The console_print_line function is a wrapper for the ConsolePrint SDK function that simply adds a line return to the message.
#define URL 0
#define FILE 1

int load_html(string, string);
int extract_html(string);
int console_print_line(string);
We then have our global variables. The table variable was covered above. skip is an array of links we are skipping. The remaining variables hold the website from which we are downloading, the source path that contains the starting page, an additional allowed relative path (the SEC stores downloads in a different location from the HTML), and the destination download directory.
string table[][];
string skip[];
string fnSite;
string fnSrcPath;
string fnRelPath;
string fnDestPath;
Let’s dive right into the main function. The first thing we do is set up our global variables from above. Then we begin the download loop. We look at the size of the table array and take a file from it. We then try to make the file’s path relative to fnSrcPath or fnRelPath. We do this so we don’t start downloading from other websites.
int main() {
    string s1, s2, s3;
    dword type;
    int sx, size;
    int rc;

    // Site
    fnSite = "https://www.sec.gov";
    // Source Path
    fnSrcPath = fnSite + "/oit/Article/";
    // Allowed Relative Path (also allows source)
    fnRelPath = fnSite + "/info/edgar/";
    // Destination Location
    fnDestPath = AddPaths(GetScriptFolder(), "Files\\");
    // Starting File
    table[sx][URL] = fnSrcPath + "info-edgar-tech-specs.html";

    // Open Console
    console_print_line("Starting download...");
    size = 1;
    while (sx < size) {
        s1 = GetFilePath(table[sx][URL]);
        s1 = MakePathRelative(fnSrcPath, s1);
        if (s1 == "") {
            s1 = GetFilePath(table[sx][URL]);
            s1 = MakePathRelative(fnRelPath, s1);
        }
        if (s1 == "") {
            if (table[0][URL] != table[sx][URL]) {
                console_print_line(FormatString(" Path not relative for %s", table[sx][URL]));
                sx++;
                continue;
            }
        }
We then get the filename and assign a specific name if it doesn’t have one. This handles the case where the web server automatically picks a file, like index.htm, for the request. We then convert the path to a Windows path and create the destination folder if needed.
        // Create Path as Required
        s2 = GetFilename(table[sx][URL]);
        if (s2 == "") {
            s2 = "index.htm";
        }
        s1 = fnDestPath + s1;
        s1 = PathToMSDelimiters(s1);
        CreateFolders(s1);
Next, the function checks whether the file has already been downloaded (there’s no sense in doing work twice), and if it has not, we download it. If it is an HTML file, we do additional processing on it using the load_html and extract_html functions. Finally, we increment the counter and go back to the start of the loop. When the loop is done, we print “Done!” to the user and end the script.
        table[sx][FILE] = s1 + s2;
        StatusBarMessage("%d %s", sx, table[sx][URL]);
        console_print_line(FormatString("Download %04d/%04d: %s", sx+1, size, table[sx][URL]));
        s2 = table[sx][FILE];
        rc = ERROR_NONE;
        if (IsFile(table[sx][FILE]) == FALSE) {
            rc = HTTPGetFile(table[sx][URL], table[sx][FILE]);
        }
        if (IsError(rc)) {
            console_print_line(FormatString(" ERROR 0x%08X on HTTP Get File for %s", rc, table[sx][URL]));
        }
        type = GetFileTypeCode(s2, TRUE);
        if (type == FT_HTML) {
            load_html(table[sx][FILE], table[sx][URL]);
            size = ArrayGetAxisDepth(table, AXIS_ROW);
            extract_html(table[sx][FILE]);
        }
        sx++;
    }
    console_print_line("Done!");
    return ERROR_NONE;
}
Now this is where things get more interesting. The load_html function analyzes the downloaded file and searches for other files to download. Since there are many ways links can be written in HTML, you may have guessed that this function is more complicated than the others. We start by creating an SGML Object using the complete path to the downloaded file. We also get the number of entries in the table array so we know where we can add entries.
int load_html(string name, string url) {
    handle hSGML;
    string s1, s2, s3;
    int dx, sx;
    int rc;

    console_print_line(FormatString(" Processing: %s", name));
    hSGML = SGMLCreate(name);
    if (hSGML == NULL_HANDLE) {
        rc = GetLastError();
        console_print_line(FormatString(" Error 0x%08X loading as SGML: %s", rc, name));
        return rc;
    }
    dx = ArrayGetAxisDepth(table, AXIS_ROW);
Once our prep work is complete, we can loop over the file’s elements using the SGMLNextElement function. We don’t care about any textual data, so we don’t need to use the SGMLNextItem function. We then get the value of the SRC or HREF HTML parameters using the SGMLGetParameter function. If the tag doesn’t have these parameters, the function will return an empty string. Now we need to process the parameter data. If we don’t want a link, we can set s2 to be empty to have the rest of our code treat the tag as if it didn’t have a link. The first thing we do is eliminate links to anchors within the same page (those starting with “#”). We then remove email links; other protocols could be removed the same way, as sketched after the snippet below. Now that the easy processing is out of the way, we can get into the harder stuff.
    s1 = SGMLNextElement(hSGML);
    while (s1 != "") {
        s2 = SGMLGetParameter(hSGML, HA_SRC);
        if (s2 == "") {
            s2 = SGMLGetParameter(hSGML, HA_HREF);
        }
        // Ignore Relative
        if (s2[0] == '#') {
            s2 = "";
        }
        // Ignore Emails
        if (FindInString(s2, "mailto:") == 0) {
            s2 = "";
        }
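The same pattern can cover other protocols we never want to follow. A minimal sketch that could sit right after the mailto check is shown here; the protocol list is purely illustrative and is not part of the published script:

        // Ignore other protocols we never want to follow
        // (illustrative list, not part of the original script)
        if (FindInString(s2, "javascript:") == 0) {
            s2 = "";
        }
        if (FindInString(s2, "tel:") == 0) {
            s2 = "";
        }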
The next few lines remove links that are fully qualified but do not point to our website. It’s important to note that this code does not handle protocol-relative links of the form <a href="//www.something.com...">, but it could be modified to do so (a sketch follows the snippet below). If the link is to a root location that is not ours, we add it to the skip list.
        // Skip roots that aren't us
        if (IsPathQualifiedWeb(s2) && s2 != "") {
            if ((FindInString(s2, fnSrcPath) < 0) && (FindInString(s2, fnRelPath) < 0)) {
                rc = FindInList(skip, s2);
                if (rc < 0) {
                    sx = ArrayGetAxisDepth(skip);
                    skip[sx] = s2;
                    console_print_line(FormatString(" Skipping : %s", s2));
                }
                s2 = "";
            }
        }
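If you did want to follow protocol-relative links, one way to handle them, as a sketch only, is to prepend a scheme right after the mailto check and before the root test above. This assumes we always want the secure scheme and is not part of the original script:

        // Sketch: turn protocol-relative links ("//host/path") into fully
        // qualified links before the root checks run (assumes https is wanted)
        if ((s2 != "") && (FindInString(s2, "//") == 0)) {
            s2 = "https:" + s2;
        }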
The next section deals with partial links. We qualify them using the current document and site and then check to see if they fall within our allowed paths. To do this, we first check whether the link is qualified using the IsPathQualifiedWeb function. If it isn’t, we then check whether it starts with “/”. If it does, the path is relative to the site root (not to the file). If it doesn’t, we assume it is relative to the file. We then adjust the path accordingly.
        // Still okay?
        if (s2 != "") {
            // Qualify it
            if (IsPathQualifiedWeb(s2) == FALSE) {
                if (s2[0] == '/') {
                    s2 = fnSite + s2;
                }
                else {
                    s2 = AddPaths(GetFilePath(url), s2);
                }
            }
Next we use a quick cheat to deal with some links being secure and some not: we simply force every link to https. We then check whether the link falls within our relative path (or source path). If it does, we strip any query string or fragment identifier from the link using the ClipQueryOrID function and add it to table. If it does not, we add it to skip. After all that link processing, we are ready to get the next tag using the SGMLNextElement function again.
            // Force Secure Match
            s2 = ReplaceInString(s2, "http://", "https://");
            // Check in root
            if ((FindInString(s2, fnSrcPath) == 0) || (FindInString(s2, fnRelPath) == 0)) {
                s2 = ClipQueryOrID(s2);
                rc = FindInTable(table, s2);
                if (rc < 0) {
                    table[dx][URL] = s2;
                    console_print_line(FormatString(" Adding : %s", s2));
                    dx++;
                }
            }
            else {
                rc = FindInList(skip, s2);
                if (rc < 0) {
                    sx = ArrayGetAxisDepth(skip);
                    skip[sx] = s2;
                    console_print_line(FormatString(" Skipping : %s", s2));
                }
            }
        }
        s1 = SGMLNextElement(hSGML);
    }
    return ERROR_NONE;
}
Finally we can talk about our extract_html function. This function is designed to remove the extraneous material that most websites wrap around their content (such as headers and footers). If we want that information, we can make this routine do nothing. Right now it looks for a tag with a specific id, and if it finds that tag, it replaces the file with the contents of the tag. We could also just verify such a tag exists and, if it doesn’t, delete the file. This is where your own customization really comes into play. Maybe you want to keep only files from the site that contain images (a sketch of that idea follows the extract_html listing below), or maybe you only want files that link to other websites. The possibilities are endless. It is important to note that this is done as a post-processing step because we don’t want to ignore links in files we don’t want to keep. Just because we don’t want a file doesn’t mean it doesn’t link to a file we do want.
Our function is pretty simple. It starts by creating an SGML Object using the file. Then it gets elements looking for our specific id. If it finds that id, it uses the SGMLFindClosingElement function to get the contents of the tag. In this example we are getting the HTML code as-is, but we could retrieve just the textual information instead. If we didn’t find the tag, we remove the file from the downloaded files since it wasn’t what we wanted. If you wanted to keep the file anyway, you could simply skip the DeleteFile call. Finally, if the tag was found, we replace the file with its contents using the StringToFile function.
int extract_html(string name) {
    handle hSGML;
    string s1, s2;
    int rc;

    // Extract Article
    console_print_line(" Looking Main Content...");
    hSGML = SGMLCreate(name);
    if (hSGML == NULL_HANDLE) {
        rc = GetLastError();
        console_print_line(FormatString(" Error 0x%08X loading as SGML: %s", rc, name));
        return rc;
    }
    s1 = SGMLNextElement(hSGML);
    while (s1 != "") {
        if (FindInString(s1, "id=\"main-content\"") > 0) {
            s2 = SGMLFindClosingElement(hSGML, SP_FCE_CODE_AS_IS);
            break;
        }
        s1 = SGMLNextElement(hSGML);
    }
    if (s2 == "") {
        console_print_line(" No main content! Deleting File.");
        DeleteFile(name);
        return ERROR_NONE;
    }
    CloseHandle(hSGML);
    StringToFile(s2, name);
    console_print_line(" Done.");
    return ERROR_NONE;
}

int console_print_line(string txt) {
    return ConsolePrint(txt + "\r\n");
}
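As an example of the kind of customization mentioned earlier, a routine that keeps only pages containing images might look roughly like the following. This is a hedged sketch rather than part of the published script: the helper name is made up, it assumes SGMLNextElement returns the full tag text (as the main script relies on), and it ignores case variations such as <IMG>.

// Sketch only: delete a downloaded page unless it contains an <img> tag
int keep_only_pages_with_images(string name) {
    handle hSGML;
    string s1;

    hSGML = SGMLCreate(name);
    if (hSGML == NULL_HANDLE) {
        return GetLastError();
    }
    s1 = SGMLNextElement(hSGML);
    while (s1 != "") {
        // Assumes the element string starts with the tag, e.g. "<img src=...>"
        if (FindInString(s1, "<img") == 0) {
            CloseHandle(hSGML);
            return ERROR_NONE;                // found an image, keep the file
        }
        s1 = SGMLNextElement(hSGML);
    }
    CloseHandle(hSGML);
    DeleteFile(name);                         // no images, drop the download
    return ERROR_NONE;
}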
Running the script as written will download all the SEC technical specification pages and their linked zip files. It will also download a file named “erica.eveland@attain.com” since the SEC’s website has an email link that is badly formed (missing “mailto:”).
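If you would rather not pick up that stray file, one possible workaround, sketched here and not part of the original script, is to also drop hrefs that look like bare email addresses (they contain an “@” but no “/”) alongside the mailto check in load_html:

        // Sketch: also ignore hrefs that look like bare email addresses
        // (contain "@" but no "/"); not part of the published script
        if ((FindInString(s2, "@") >= 0) && (FindInString(s2, "/") < 0)) {
            s2 = "";
        }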
There you have it: a short Legato script that can download a website or any part of a website. You may have noticed that this code does not deal with JavaScript URL references, as that is a rather complex process we may cover some other day. You could easily alter this script to only download images or PDFs, check for broken links, generate a sitemap, or perform any number of other tasks!
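For example, to grab only documents rather than pages, the link-queuing logic in load_html could drop anything that does not look like a PDF or ZIP while still crawling the HTML pages that link to them. A rough sketch only, using naive substring tests that are not part of the published script:

        // Sketch: keep crawling HTML pages, but only queue other targets
        // that appear to be PDFs or ZIP archives (naive tests, sketch only)
        if ((FindInString(s2, ".htm") < 0) &&
            (FindInString(s2, ".pdf") < 0) &&
            (FindInString(s2, ".zip") < 0)) {
            s2 = "";
        }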
David Theis has been developing software for Windows operating systems for over fifteen years. He has a Bachelor of Sciences in Computer Science from the Rochester Institute of Technology and co-founded Novaworks in 2006. He is the Vice President of Development and is one of the primary developers of GoFiler, a financial reporting software package designed to create and file EDGAR XML, HTML, and XBRL documents to the U.S. Securities and Exchange Commission.
Additional Resources
Legato Script Developers LinkedIn Group
Primer: An Introduction to Legato