views:

159

answers:

3

Hi Guys,

Sorry to bother you guys again, but here's my dilemma.

There must be a "better" regular expression to identify HTML links in a paragraph of text (there can be more than one link in the text). How do I extract all the links and anchor them in JavaScript?

My attempt (in javascript) is like this:

// Regex source for a URL, assembled piece by piece. The concatenation yields
// a single pattern string that is later handed to the RegExp constructor.
var urlPattern =
    // scheme: http, https or ftp
    "(https?|ftp)://" +
    // optional "www." prefix
    "(www\\.)?" +
    // host: dotted name ending in 2-4 letters, "localhost", or four dotted number groups
    "(((([a-zA-Z0-9.-]+\\.){1,}[a-zA-Z]{2,4}|localhost))|((\\d{1,3}\\.){3}(\\d{1,3})))" +
    // optional ":port"
    "(:(\\d+))?" +
    // optional path (plain characters or %XX escapes)
    "(/([a-zA-Z0-9-._~!$&'()*+,;=:@/]|%[0-9A-F]{2})*)?" +
    // optional "?query"
    "(\\?([a-zA-Z0-9-._~!$&'()*+,;=:/?@]|%[0-9A-F]{2})*)?" +
    // optional "#fragment"
    "(#([a-zA-Z0-9._-]|%[0-9A-F]{2})*)?";

/**
 * Matches s against urlPattern, compiled with no regex flags.
 *
 * @param {string} s - Text to search.
 * @returns {?Array} The String.prototype.match result: the first match
 *     followed by its capture groups, or null when nothing matches.
 */
function extractURLs(s) {
    var matcher = new RegExp(urlPattern);
    return s.match(matcher);
}

// The parameter s of extractURLs is of type String.

// For testing: run extractURLs over a sample sentence that mentions two video
// links and show whatever it returns in an alert box. The space inside "ww w"
// in each link is deliberate (see the note below the code).
var text = "Check this video out http://ww w.youtube.com/watch?v=y3U3R3b1dOg or http://ww w.youtube.com/watch?v=sX6Vm0MoPCY";
alert(extractURLs(text));

(Spaces have been deliberately added to the hyperlinks here to allow the question to be posted on SO.) Result: I only get the first hyperlink and not the second one. Has anybody done something similar or better that I can use?

Thanks in advance.

A: 
// Regex source (as a string) describing an http/https/ftp URL with optional
// port, path, query and fragment parts.
var urlPattern = "(https?|ftp)://(www\\.)?(((([a-zA-Z0-9.-]+\\.){1,}[a-zA-Z]{2,4}|localhost))|((\\d{1,3}\\.){3}(\\d{1,3})))(:(\\d+))?(/([a-zA-Z0-9-._~!$&'()*+,;=:@/]|%[0-9A-F]{2})*)?(\\?([a-zA-Z0-9-._~!$&'()*+,;=:/?@]|%[0-9A-F]{2})*)?(#([a-zA-Z0-9._-]|%[0-9A-F]{2})*)?";


/**
 * Extracts every URL that urlPattern matches in s.
 *
 * @param {string} s - Text to search.
 * @returns {?string[]} All matched URLs in order of appearance, or null when
 *     the text contains none.
 */
function extractURLs(s) {
    // The "g" flag makes match() return every full match. Without it the
    // result is the first match followed by its capture groups, so index 1
    // would be the scheme ("http") rather than the second URL.
    return s.match(new RegExp(urlPattern, "g"));
}

var text = "Check this video out http://www.youtube.com/watch?v=y3U3R3b1dOg or http://www.youtube.com/watch?v=sX6Vm0MoPCY";
var results = extractURLs(text);

// results already holds the extracted URLs, so show them directly instead of
// running the joined string through extractURLs a second time.
alert(results[0] + ", " + results[1]);
Ralph Stevens
That I know....but it's not returning 2 links on the text...only the first one.
The Elite Gentleman
Look at my edited response. I modified your code.
Ralph Stevens
results[1] gives me "http" so that doesn't retrieve the 2nd url from the text string.
The Elite Gentleman
Alsciende's response worked....thanks for your help!
The Elite Gentleman
+2  A: 

Use the "g" modifier:

/**
 * Collects every substring of s that matches urlPattern.
 *
 * @param {string} s - Text to search.
 * @returns {?string[]} All matches in order of appearance, or null when there
 *     are none.
 */
function extractURLs(s) {
    // The "g" flag is what makes match() return every match instead of only
    // the first one.
    var globalMatcher = new RegExp(urlPattern, "g");
    return s.match(globalMatcher);
}
Alsciende
Thanks, this worked! :-)
The Elite Gentleman
A: 

It is better to write it as,

// URL pattern written as a regex literal with the global flag.
// Inside a literal every "/" must be escaped as "\/" (an unescaped one ends
// the literal), and escapes use a single backslash ("\." and "\d"); the
// doubled "\\" form is only needed when the pattern is written as a string.
var urlPattern = /(https?|ftp):\/\/(www\.)?(((([a-zA-Z0-9.-]+\.){1,}[a-zA-Z]{2,4}|localhost))|((\d{1,3}\.){3}(\d{1,3})))(:(\d+))?(\/([a-zA-Z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?(\?([a-zA-Z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*)?(#([a-zA-Z0-9._-]|%[0-9A-F]{2})*)?/g;

/**
 * Extracts every URL that urlPattern matches in s.
 *
 * @param {string} s - Text to search.
 * @returns {?string[]} All matched URLs in order of appearance, or null when
 *     the text contains none.
 */
function extractURLs(s) {
    // match() with a global regex always scans from the start of s, so the
    // shared urlPattern object can be reused safely across calls here.
    return s.match(urlPattern);
}

Here urlPattern is pre-compiled rather than compiled every time the function is called, which results in better performance.

Livingston Samuel
True, but extractURLs(...) isn't the only function available; there are functions like isValidURL(url) that use urlPattern, and some where it is prepended or appended with other expressions.
The Elite Gentleman
actually your urlPattern will fail to compile....Solution: var urlPattern = /(https?|ftp)://(www\.)?(((([a-zA-Z0-9.-]+\.){1,}[a-zA-Z]{2,4}|localhost))|((\d{1,3}\.){3}(\d{1,3})))(:(\d+))?(/([a-zA-Z0-9-._~!$=:@/]|%[0-9A-F]{2})*)?(\?([a-zA-Z0-9-._~!$=:/?@]|%[0-9A-F]{2})*)?(#([a-zA-Z0-9._-]|%[0-9A-F]{2})*)?;You should have removed the /g and replaced the \\ to \
The Elite Gentleman