// Client for Project 1, phase 1 of EE 122 Fall 2007.
//
// Written by Daniel Killebrew.
//
// The client
// (1) reads a URL from standard in or stops at EOF
// (2) connects to the server and sends a GET request
// (3) prints to standard out the server's response
// (4) waits for server to close connection
// (5) goes back to step (1)

// NOTE(review): the header names were missing from this copy of the file;
// the list below is reconstructed from the library calls the code makes.
// Confirm against the original source.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>
#include <regex.h>
#include <string>
#include <iostream>
#include <fstream>
#include <algorithm>

using namespace std;

const char * defaultPort = "80";
const char * defaultPath = "/";

// should we print out extra info to stdout?
bool verbose = false;

// was persistent specified on command line?
bool persistent = false;

// did the server indicate that it supports persistent in its
// most recent response?
bool serverSupportsPersistent = false;

// data we have read from the socket but not processed yet
string partialData;

// the socket to the server
int sock;

// if we are now receiving a file from the server
bool receiving;

// size of the file the server is sending us, in bytes
unsigned int contentLength;

// if the server is sending using chunking
bool chunked;

// if the socket to server should still be open
bool socketOpen;

// Attempts to send sendAmt bytes of data from the given buffer.
// Returns how many bytes were actually sent, which could be fewer if
// an error occurs (including the other side closing the socket).
int sendAll(int sock, const char *buf, int sendAmt)
{
    int sentSoFar = 0;

    while (sendAmt > 0) {
        // send() may accept only part of the data, so always resume
        // from the first byte that has not been sent yet.
        int justSent = send(sock, buf + sentSoFar, sendAmt, 0);

        // Stop on an error, and also on a zero-byte result so that we
        // can never spin forever without making progress.
        if (justSent <= 0)
            return sentSoFar;

        sentSoFar += justSent;
        sendAmt -= justSent;
    }

    return sentSoFar;
}

// Read from standard in until we get a newline;
// that will mean we've gotten an 'http://' from the user.
//
// partialLine carries unprocessed input between calls; fullURL receives
// the next line without its newline.  Returns 1 when fullURL holds a
// line, 0 on end of file or on a read error.  If input ends without a
// final newline, the text read so far is still returned as one last line.
int grabFromStdin(string &partialLine, string &fullURL)
{
    const int size = 128;
    char buf[size];
    string::size_type newlineIndex;

    // Grab data from standard in until we encounter \n.
    do {
        char *retval = fgets(buf, size, stdin);

        if (retval == NULL) {
            // An error or End of File.
            if (ferror(stdin)) {
                printf("Error reading from standard in. Closing.\n");
                return 0;
            }

            // End of file: hand back an unterminated final line rather
            // than silently dropping it.
            if (!partialLine.empty()) {
                fullURL = partialLine;
                partialLine.clear();
                return 1;
            }

            // Indicate that we are done reading from stdin.
            return 0;
        }

        partialLine.append(retval);
        newlineIndex = partialLine.find("\n");
    } while (newlineIndex == string::npos);

    // Copy everything up to, not including, the newline.
    fullURL.assign(partialLine, 0, newlineIndex);

    // Delete what we copied, along with the newline.  But we
    // need to keep the remainder, since fgets() may have started
    // reading into the next line.
    partialLine.erase(0, newlineIndex + 1);

    return 1;
}

// Split the URL that the user types in into its component parts.
// Return these through the references that are passed in.
//
// portmatch and pathmatch fall back to defaultPort / defaultPath when
// the URL does not contain them.  Returns 1 on success; on failure
// prints an error message and returns 0.
int parseURL(string &fullURL, string &hostmatch, string &portmatch, string &pathmatch)
{
    // HTTP-URL = "http://" Host ?Port ?Path
    // Host = Hostname | Hostnumber
    // Hostname = ALPHA *AlphaNum ?("." Hostname)
    // Hostnumber = +DIGIT "." +DIGIT "." +DIGIT "." +DIGIT
    // Port = ":" +DIGIT
    // Path = "/" *FileNameChar
    // FileNameChar = ALPHA | DIGIT | "." | "-" | "_" | "/"
    // AlphaNum = ALPHA | DIGIT

    // Don't forget that '\' is special to C, so we must escape it
    // with another '\'.
    //
    // We match the host, port and path using \(\) so we can extract
    // them as RE subexpressions:
    //
    //   \(hostname|hostnumber\)\(port\)?\(path\)?$";
    //
    // This makes for a complicated set of regular expressions.
    // You can read up on them if you are curious:
    // http://www.opengroup.org/pubs/online/7908799/xbd/re.html#tag_007_003
    // http://www.opengroup.org/pubs/online/7908799/xsh/regexec.html
    string start = "^[hH][tT][tT][pP]://";
    string hostname = "[[:alpha:]][[:alpha:][:digit:]]*(\\.[[:alpha:]][[:alpha:][:digit:]]*)*";
    string hostnumber = "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+";
    string port = ":([0-9]+)";
    // Note that '-' must come last in the character class here,
    // because otherwise it has a different meaning (indicating
    // a *range* of characters to match).
    string path = "/[[:alpha:][:digit:]._/-]*";

    // Match the hostname/hostnumber:
    string pattern;
    pattern.append(start).append("(");
    pattern.append(hostname).append("|");
    pattern.append(hostnumber).append(")");

    string remaining = fullURL;

    regex_t re;
    const int nmatches = 4;
    regmatch_t matches[nmatches];
    int ret;

    if (regcomp(&re, pattern.c_str(), REG_EXTENDED) != 0) {
        printf("couldn't build regular expression\n");
        return 0;   /* report error */
    }
    ret = regexec(&re, remaining.c_str(), nmatches, matches, 0);
    // All done with the pattern-matcher, delete it.
    regfree(&re);

    if (ret == REG_NOMATCH) {
        printf("ERROR -- Invalid URL: %s\n", fullURL.c_str());
        return 0;
    } else if (ret != 0) {
        printf("ERROR -- regexec failed\n");
        return 0;
    }

    // Extract the matching parts of host.
    hostmatch = remaining.substr(matches[1].rm_so,
                                 matches[1].rm_eo - matches[1].rm_so);

    // Now make a string from the remaining URL.
    remaining = remaining.substr(matches[0].rm_eo);

    // Match port, if it's there.
    pattern = "^";
    pattern.append(port);
    portmatch = defaultPort;

    if (regcomp(&re, pattern.c_str(), REG_EXTENDED) != 0) {
        printf("regcomp failure\n");
        return 0;   /* report error */
    }
    ret = regexec(&re, remaining.c_str(), nmatches, matches, 0);
    regfree(&re);   // done with the pattern-matcher

    if (ret == 0) {
        // We matched.
        portmatch = remaining.substr(matches[1].rm_so,
                                     matches[1].rm_eo - matches[1].rm_so);

        // Now make a string from the remaining URL.
        remaining = remaining.substr(matches[0].rm_eo);
    }

    // Match the path, if it's there, and match the end of the URL.
    pattern = "^(";
    pattern.append(path).append(")?$");   // path component is optional
    pathmatch = defaultPath;

    if (regcomp(&re, pattern.c_str(), REG_EXTENDED) != 0) {
        printf("regcomp failure\n");
        return 0;   /* report error */
    }
    ret = regexec(&re, remaining.c_str(), nmatches, matches, 0);
    regfree(&re);   // done with the pattern-matcher

    if (ret == REG_NOMATCH) {
        printf("ERROR -- Invalid URL: %s\n", fullURL.c_str());
        return 0;
    } else if (ret != 0) {
        // Any other failure must not be reported as a successful parse.
        printf("ERROR -- regexec failed\n");
        return 0;
    }

    // We matched.  Check whether there actually was a path.
    if (matches[1].rm_so != -1)
        pathmatch = remaining.substr(matches[1].rm_so,
                                     matches[1].rm_eo - matches[1].rm_so);

    return 1;
}

// Connect to the server with this hostname and port.
// The socket will be stored into the socket descriptor passed in ("sock").
//
// Returns true once connected.  On any failure an error is printed,
// no descriptor is left open, "sock" is set to -1 and false is returned,
// so a caller that close()s "sock" afterwards cannot hit a live or
// unrelated descriptor.
bool connectToServer(string host, int port, int &sock)
{
    struct hostent *hent;
    struct sockaddr_in saddr;

    // Invalidate the caller's descriptor up front so that every failure
    // path below reports "no socket".
    sock = -1;

    if ((hent = gethostbyname(host.c_str())) == NULL) {
        printf("ERROR -- Network Error: gethostbyname error\n");
        return false;
    }

    sock = socket(PF_INET, SOCK_STREAM, 0);
    if (sock < 0) {
        perror("ERROR -- Network Error");
        sock = -1;
        return false;
    }

    memset(&saddr, 0, sizeof(struct sockaddr_in));
    saddr.sin_family = AF_INET;
    saddr.sin_port = htons(port);
    // Copy the first resolved address; memcpy avoids relying on the
    // alignment of the resolver's buffer.
    memcpy(&saddr.sin_addr, hent->h_addr_list[0], sizeof(saddr.sin_addr));

    if (connect(sock, (sockaddr *) &saddr, sizeof(saddr)) == 0)
        return true;

    // Report first: close() below could overwrite errno.
    perror("ERROR -- Network Error");
    close(sock);
    sock = -1;
    return false;
}

// Create the GET request and send it on "sock".
bool sendRequest(string path, int sock) { string GETrequest = "GET "; GETrequest.append(path); GETrequest.append(" HTTP/1.0\r\n"); if (persistent) GETrequest.append("Connection: Keep-Alive\r\n"); GETrequest.append("\r\n"); if (verbose) cout << "Sending request: >" << GETrequest << "<" << endl; unsigned int bytes = sendAll(sock, GETrequest.c_str(), GETrequest.length()); return bytes == GETrequest.length(); } // Retrieves data using recv() and stuffs the bytes into partialData // until encountering CRLF CRLF, at which point it sticks the status // line, headers, and CRLF CRLF into completeResponse (removing them // from partialData). // // Returns true when completeResponse has the response in it, false if // the socket is closed before a complete response is received. bool readResponse(string &completeResponse) { const int bufSize = 256; char buf[bufSize]; string::size_type pos; while ((pos = partialData.find("\r\n\r\n")) == string::npos) { int amtRecvd = recv(sock, buf, bufSize, 0); if (amtRecvd == 0) { cout << "ERROR -- server closed socket before sending full response" << endl; return false; } else if (amtRecvd < 0) { perror("ERROR -- bad status from recv()"); return false; } // Append all bytes received. partialData.append(buf, amtRecvd); // Search incomplete buffer for the blank line. } // Grab everything up to the blank line, put into complete buffer. // The constant 4 here comes from the length of CRLF/CRLF. completeResponse = partialData.substr(0, pos + 4); // Print status line and headers to stdout. cout << completeResponse.substr(0, completeResponse.length() - 4) << endl; // Remove the stuff we just grabbed from partial. partialData.erase(0, pos + 4); return true; } // Search for content length, chunked, and persistent headers. 
// Each header that is recognized is removed from "headers" and recorded
// in a global: contentLength, chunked or serverSupportsPersistent.
// The globals are only ever set here, never reset, so the caller must
// put them in their default state first.  Matching is case-insensitive.
// If a pattern cannot be compiled the function returns early, leaving
// the remaining headers unexamined.
void parseHeaders(string &headers)
{
    const int nmatches = 2;
    regmatch_t matches[nmatches];
    regex_t re;
    int ret;

    // Content-Length:
    const char *contentlength = "Content-Length:[ \t]*([[:digit:]]+)[ \t]*\r\n";

    if (regcomp(&re, contentlength, REG_EXTENDED | REG_ICASE) != 0)
        // Probably we won't be able to build any other patterns
        // given that this one failed.
        return;

    ret = regexec(&re, headers.c_str(), nmatches, matches, 0);
    regfree(&re);

    if (ret == 0) {
        string scl = headers.substr(matches[1].rm_so,
                                    matches[1].rm_eo - matches[1].rm_so);
        headers.erase(matches[0].rm_so, matches[0].rm_eo - matches[0].rm_so);

        // The pattern guarantees digits only.  strtoul covers the whole
        // unsigned range, whereas atoi cannot represent values above
        // INT_MAX.
        contentLength = static_cast<unsigned int>(strtoul(scl.c_str(), NULL, 10));
    }

    // Transfer-Encoding: chunked
    const char *chunkedpattern = "Transfer-Encoding:[ \t]*chunked[ \t]*\r\n";

    if (regcomp(&re, chunkedpattern, REG_EXTENDED | REG_ICASE) != 0)
        return;

    ret = regexec(&re, headers.c_str(), nmatches, matches, 0);
    regfree(&re);

    if (ret == 0) {
        chunked = true;
        headers.erase(matches[0].rm_so, matches[0].rm_eo - matches[0].rm_so);
    }

    // Connection: keep-alive
    const char *keepalivepattern = "connection:[ \t]*keep-alive[ \t]*\r\n";

    if (regcomp(&re, keepalivepattern, REG_EXTENDED | REG_ICASE) != 0)
        return;

    ret = regexec(&re, headers.c_str(), nmatches, matches, 0);
    regfree(&re);

    if (ret == 0) {
        serverSupportsPersistent = true;
        headers.erase(matches[0].rm_so, matches[0].rm_eo - matches[0].rm_so);
    }
}

// Handles HTTP error messages (400,404,501) and parses 200 response headers.
// Returns false if a parsing error occurred and socket should be closed.
bool parseResponse(string &response) { // Match the status line: // HTTP-Version SP Status-Code SP Reason-Phrase CRLF char *statusline = "HTTP/[[:digit:]]+\\.[[:digit:]]+[ \t]+([[:digit:]]+[ \t]+[^(\r\n)]*)\r\n"; regex_t re; int ret; if (regcomp(&re, statusline, REG_EXTENDED) != 0) return false; const int nmatches = 2; regmatch_t matches[nmatches]; ret = regexec(&re, response.c_str(), nmatches, matches, 0); regfree(&re); if (ret != 0) { cout << "ERROR -- could not parse server's response" << endl; return false; } // Capture the status code and reason phrase. string codeAndPhrase = response.substr(matches[1].rm_so, matches[1].rm_eo - matches[1].rm_so); // Now delete the status line from response. response.erase(0, matches[0].rm_eo); // Check for status code == 200. char *statuscode = "^200[ \t]+"; if (regcomp(&re, statuscode, REG_EXTENDED) != 0) { cout << "ERROR -- regcomp failed" << endl; return false; } // Constant 1 in the following call reflects that we're want // exactly 1 match, i.e., the beginning of the string. ret = regexec(&re, codeAndPhrase.c_str(), 1, matches, 0); regfree(&re); if (ret != 0) { // not 200 cout << codeAndPhrase << endl; parseHeaders(response); return true; } // Set flags/counters to default value. contentLength = 0; chunked = false; serverSupportsPersistent = false; // Get flags/counters from headers. parseHeaders(response); receiving = true; return true; } // Reads from sock until partialData contains a CRLF. // Returns false if socket closed. bool readUpToCRLF() { unsigned int bufsize = 128; char buf[bufsize]; while (partialData.find("\r\n") == string::npos) { int bytesRecvd = recv(sock, buf, bufsize, 0); if (bytesRecvd < 1) { cout << "ERROR -- socket closed unexpectedly" << endl; return false; } partialData.append(buf, bytesRecvd); } return true; } // Removes chunk size and the CRLF from the beginning of buffer. // Returns true if successful. 
bool chunkSize(string &buffer, unsigned int &csize) { char *sizepattern = "^([[:digit:]abcdef]+)\r\n"; regex_t re; int ret; if (regcomp(&re, sizepattern, REG_EXTENDED | REG_ICASE) != 0) return false; const int nmatches = 2; regmatch_t matches[nmatches]; ret = regexec(&re, buffer.c_str(), nmatches, matches, 0); regfree(&re); if (ret != 0) { cout << "ERROR -- bad chunk size" << endl; return false; } // Capture the chunk size. string size = buffer.substr(matches[1].rm_so, matches[1].rm_eo - matches[1].rm_so); sscanf(size.c_str(), "%x", &csize); // Now delete the chunk size and CRLF from buffer. buffer.erase(matches[0].rm_so, matches[0].rm_eo - matches[0].rm_so); return true; } // Writes "total" number of bytes to disk, first taking from partialData, // then from the socket. // Returns false if socket was closed prematurely. bool writeBytes(unsigned int total, ofstream &out) { unsigned int bufsize = 1024; char buf[bufsize]; int bytesRecvd; if ( total == 0 ) return true; // First write what's in partialData. unsigned int amt = min((unsigned int) partialData.length(), total); out.write(partialData.c_str(), amt); // Delete what we wrote. partialData.erase(0, amt); total -= amt; // Keep reading/writing until we've done all of total. while (total > 0) { amt = min(total, bufsize); bytesRecvd = recv(sock, buf, amt, 0); if (bytesRecvd < 1) { cout << "ERROR -- connection closed before the full file was received" << endl; return false; } out.write(buf, bytesRecvd); total -= bytesRecvd; } return true; } // Saves the file the server is sending to disk. // Returns false if socket should be closed. bool saveToDisk(ofstream &out) { unsigned int bufsize = 1024; char buf[bufsize]; int bytesRecvd; if (!chunked && contentLength == 0) { // Save to disk until the server closes the connection. 
out.write(partialData.c_str(), partialData.length()); do { bytesRecvd = recv(sock, buf, bufsize, 0); if (bytesRecvd > 0) out.write(buf, bytesRecvd); } while (bytesRecvd > 0); return false; } if (contentLength > 0) // Read contentLength bytes and write them to the file. return writeBytes(contentLength, out); // If we got this far, then it's a chunked transfer. // Because we've structured our routines to consume from // the socket and write to a file, we now open up a null // file that we can hand to those routines when we want to // skip over the framing goop that occurs inside a chunked // transfer. ofstream devnull("/dev/null", ios::out | ios::binary); if (!devnull.is_open()) return false; // bizarre - couldn't open /dev/null ... while (true) { // Each iteration processes one chunk. // Read the chunk size. unsigned int chunksize; // Each chunk size is followed by a CRLF. if (!readUpToCRLF()) return false; if (!chunkSize(partialData, chunksize)) // No luck reading it. return false; if (chunksize == 0) { // We've read the final chunk. Throw away // the associated goop. // Read the CRLF following the chunksize if (!readUpToCRLF()) return false; // throw away the final CRLF. if (!writeBytes(2, devnull)) return false; // We read the last chunk (the 0 size chunk), so return. return true; } // Read the full chunk and write to disk. if (!writeBytes(chunksize, out)) return false; // Get rid of the CRLF after the data. if (!writeBytes(2, devnull)) return false; } } // Deal with the server's response, saving file to disk if it sends us one. // Return false if the socket should be closed. bool handleResponse(string fileName) { string response; receiving = false; if (!readResponse(response)) return false; if (!parseResponse(response)) return false; if (!receiving) return persistent && serverSupportsPersistent; // The beauty of this object is that when it goes out of scope, it // is automatically close()ed for us. 
ofstream outfile(fileName.c_str(), ios::out | ios::binary); if (!outfile.is_open()) { cout << "ERROR -- could not open " << fileName << " for writing" << endl; return false; } if (!saveToDisk(outfile)) return false; return true; } // Returns a string that is the file name that we should save the server's // response to, generated according to the rules. string generateFileName(string requestedURL) { string::size_type pos; if (requestedURL == "/") // It's just a bare slash. return string("dir"); // If requestedURL is /foo/bar/ (it ends in '/') ... if (requestedURL.at(requestedURL.length() - 1) == '/') { // Take everything between the last 2 '/' (e.g., "bar"). pos = requestedURL.rfind('/', requestedURL.length() - 2); string betweenSlashes = requestedURL.substr(pos + 1, requestedURL.length() - pos - 2); if (betweenSlashes == "..") return string("dotdot"); if (betweenSlashes == ".") return string("dot"); return betweenSlashes; } // The requested URL does not end with '/', so take everything after // the last '/'. pos = requestedURL.rfind('/'); string afterLastSlash = requestedURL.substr(pos + 1); if (afterLastSlash == "..") return string("dotdot"); if (afterLastSlash == ".") return string("dot"); return afterLastSlash; } // Keep grabbing lines from standard in, parsing each line, connecting // to the server, sending them over, and receiving the response. int main(int argc, char **argv) { string partialLine; string fullURL, path, newHost, newPort; bool connected = false; string currentHost, currentPort; int option; while ((option=getopt(argc, argv, "vp")) != -1) { if (option=='v') verbose=true; else if (option=='p') persistent = true; else if (option=='?') { printf("ERROR -- usage is: http_client [-p] [-v]\n"); return -1; } } // Loop while stdin is not EOF or some error. while (grabFromStdin(partialLine, fullURL)) { if (!parseURL(fullURL, newHost, newPort, path)) // Bad URL. Skip it and get another. 
continue; if (verbose) cout << "Host: " << newHost << "\nPort: " << newPort << "\nPath: " << path << endl; // Connect to a new server if disconnected or host/port // has changed. if (!connected || currentHost != newHost || currentPort != newPort) { if (connected) { // Disconnect from old server. close(sock); connected = false; } // Remember new host info. currentHost = newHost; currentPort = newPort; if (!connectToServer(currentHost, atoi(currentPort.c_str()), sock)) { close(sock); continue; } } // Now we're connected. connected = true; if (!sendRequest(path, sock)) { connected = false; close(sock); printf("ERROR -- Network Error: could not send GET request\n"); continue; } string fileName = generateFileName(path); if(verbose) cout << "Name of file we are saving: " << fileName << endl; if (!handleResponse(fileName) || !persistent) { // Close socket if the handleResponse call failed or // if we don't want a persistent connection. connected = false; close(sock); } } return 0; }